In [1]:
# 1.Load the dataset into a data frame using Pandas
import pandas as pd

# Load the dataset
email_data = pd.read_csv(r"C:\Users\Sarthak Kulkarni\Desktop\Hexaware Python Training\Data_engineering\Case-Study\Python(Case_Study)\email.csv")

# Display the first few rows
email_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
# 2.Explore the number of rows & columns, ranges of values etc.

# Number of rows and columns
print("Shape of the dataset:", email_data.shape)

# Column names and data types
print("\nColumn information:")
print(email_data.info())

# Summary statistics for numerical columns
print("\nSummary statistics for numerical columns:")
print(email_data.describe())

# Checking the range of values in numerical columns
print("\nRange of values in numerical columns:")
for column in email_data.select_dtypes(include='number').columns:
    print(f"{column}: {email_data[column].min()} to {email_data[column].max()}")

# Check for missing values
print("\nMissing values in each column:")
print(email_data.isnull().sum())

# Preview unique values in categorical columns
print("\nUnique values in categorical columns:")
for column in email_data.select_dtypes(include='object').columns:
    print(f"{column}: {email_data[column].nunique()} unique values")


Shape of the dataset: (5573, 2)

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None

Summary statistics for numerical columns:
       Category                 Message
count      5573                    5573
unique        3                    5158
top         ham  Sorry, I'll call later
freq       4825                      30

Range of values in numerical columns:

Missing values in each column:
Category    0
Message     0
dtype: int64

Unique values in categorical columns:
Category: 3 unique values
Message: 5158 unique values


In [3]:
# 3.Handle missing, incorrect and invalid data

# Check for missing values
print("Missing values before handling:")
print(email_data.isnull().sum())

# Fill missing values for numerical columns with the mean
email_data.fillna(email_data.mean(numeric_only=True), inplace=True)

# Fill missing values for categorical columns with a placeholder or mode
for column in email_data.select_dtypes(include='object').columns:
    email_data[column].fillna('Unknown', inplace=True)

print("\nMissing values after handling:")
print(email_data.isnull().sum())

# Handling Incorrect or Invalid Data
for column in email_data.select_dtypes(include='number').columns:
    email_data[column] = email_data[column].apply(lambda x: None if x < 0 else x)

# Drop or impute invalid values after replacement
email_data.fillna(email_data.mean(numeric_only=True), inplace=True)

# Display summary after cleaning
print("\nDataset summary after handling invalid data:")
print(email_data.info())


Missing values before handling:
Category    0
Message     0
dtype: int64

Missing values after handling:
Category    0
Message     0
dtype: int64

Dataset summary after handling invalid data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [4]:
# 4.Perform any additional steps (parsing dates, creating additional columns, merging multiple dataset etc.)

from datetime import datetime

# Load the datasets
file_path = r"C:\Users\Sarthak Kulkarni\Desktop\Hexaware Python Training\Data_engineering\Case-Study\Python(Case_Study)\email.csv"
other_file_path = r"C:\Users\Sarthak Kulkarni\Desktop\Hexaware Python Training\Data_engineering\Case-Study\Python(Case_Study)\spam.csv"

email_data = pd.read_csv(file_path)
other_data = pd.read_csv(other_file_path)

# Display dataset previews
print("Email Dataset Preview:")
print(email_data.head())

print("\nOther Dataset Preview:")
print(other_data.head())

# Add additional columns to the email dataset
email_data['processing_date'] = datetime.now().date()
email_data['message_length'] = email_data['Message'].str.len()
email_data['is_spam'] = email_data['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Merge the datasets using a common column (e.g., 'Message')
try:
    merged_data = pd.merge(email_data, other_data, on='Message', how='inner')  # Use the actual column name here
    print("\nMerged Dataset Preview:")
    print(merged_data.head())
except KeyError as e:
    print(f"KeyError: {e}. Ensure the column name exists in both datasets.")

# Final Summary
print("\nFinal Dataset Info:")
print(email_data.info())


Email Dataset Preview:
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Other Dataset Preview:
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...

Merged Dataset Preview:
  Category_x                                            Message  \
0        ham  Go until jurong point, crazy.. Available only ...   
1        ham                      Ok lar... Joking

In [5]:
# 5.Write the panda and Numpy queries on the EDA Of Data

# Count unique values in each column
print("\nUnique Values in Each Column:")
for column in email_data.columns:
    print(f"{column}: {email_data[column].nunique()} unique values")

# Frequency of categories in the 'Category' column
print("\nValue Counts for 'Category':")
print(email_data['Category'].value_counts())



Unique Values in Each Column:
Category: 3 unique values
Message: 5158 unique values
processing_date: 1 unique values
message_length: 276 unique values
is_spam: 2 unique values

Value Counts for 'Category':
Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64
