In [1]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values
import pandas as pd
import numpy as np
# Toy dataset with exactly one missing entry in every column.
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda', None],
    'Age': [28, 34, None, 41, 37],
    'City': ['New York', None, 'Boston', 'Chicago', 'Miami'],
    'Salary': [75000, 65000, 80000, None, 90000]
}
df = pd.DataFrame(data)

# Round-trip through a CSV file so the rest of the cell works on data that
# was actually loaded from disk (the file is written to the working directory).
df.to_csv('sample_data.csv', index=False)
print("Sample CSV file created with the following data:")
print(df)
print("\n")

# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
df = pd.read_csv('sample_data.csv')
print("Data loaded from CSV:")
print(df)
print("\n")

# Build the boolean masks once and reuse them for every summary below.
missing_mask = df.isnull()                  # True where a cell is missing
incomplete_rows = missing_mask.any(axis=1)  # True where a row has >= 1 missing cell

print("Missing values (True indicates missing):")
print(missing_mask)
print("\n")
print("Count of missing values in each column:")
print(missing_mask.sum())
print("\n")
print("Total missing values in the DataFrame:", missing_mask.sum().sum())
print("\n")
# The mean of a boolean column is the fraction of True values.
print("Percentage of missing values in each column:")
print(missing_mask.mean() * 100)
print("\n")
print("Rows with at least one missing value:")
print(df[incomplete_rows])
print("\n")
print("Rows with complete data:")
print(df[~incomplete_rows])
      

Sample CSV file created with the following data:
    Name   Age      City   Salary
0   John  28.0  New York  75000.0
1   Anna  34.0      None  65000.0
2  Peter   NaN    Boston  80000.0
3  Linda  41.0   Chicago      NaN
4   None  37.0     Miami  90000.0


Data loaded from CSV:
    Name   Age      City   Salary
0   John  28.0  New York  75000.0
1   Anna  34.0       NaN  65000.0
2  Peter   NaN    Boston  80000.0
3  Linda  41.0   Chicago      NaN
4    NaN  37.0     Miami  90000.0


Missing values (True indicates missing):
    Name    Age   City  Salary
0  False  False  False   False
1  False  False   True   False
2  False   True  False   False
3  False  False  False    True
4   True  False  False   False


Count of missing values in each column:
Name      1
Age       1
City      1
Salary    1
dtype: int64


Total missing values in the DataFrame: 4


Percentage of missing values in each column:
Name      20.0
Age       20.0
City      20.0
Salary    20.0
dtype: float64


Rows with at least o

In [2]:
# corrected Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset
import pandas as pd
import numpy as np
# Two of the four rows carry a NaN (one in Age, one in Salary).
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
    Salary=[50000, 60000, np.nan, 70000],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Listwise deletion: keep only the rows in which every column is present.
row_is_complete = df.notna().all(axis=1)
df_cleaned = df.loc[row_is_complete]
print("\nDataFrame After Dropping Rows with Missing Values:", df_cleaned, sep="\n")


Original DataFrame:
      Name   Age   Salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  30.0      NaN
3    David  28.0  70000.0

DataFrame After Dropping Rows with Missing Values:
    Name   Age   Salary
0  Alice  25.0  50000.0
3  David  28.0  70000.0


In [3]:
# corrected Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.
import pandas as pd
import numpy as np
# Age and Salary each contain one NaN; Department is missing in every row.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
    Salary=[50000, 60000, np.nan, 70000],
    Department=[None, None, None, None],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Column-wise deletion: a column survives only if none of its values is missing.
column_is_complete = df.notna().all(axis=0)
df_cleaned = df.loc[:, column_is_complete]
print("\nDataFrame After Dropping Columns with Missing Values:", df_cleaned, sep="\n")



Original DataFrame:
      Name   Age   Salary Department
0    Alice  25.0  50000.0       None
1      Bob   NaN  60000.0       None
2  Charlie  30.0      NaN       None
3    David  28.0  70000.0       None

DataFrame After Dropping Columns with Missing Values:
      Name
0    Alice
1      Bob
2  Charlie
3    David


In [4]:
# corrected Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.
import pandas as pd
import numpy as np
# Bob's age is the only missing value.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, 28],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# The mean is taken over the observed ages only (NaN is skipped), then used
# as the replacement for the missing entry.
mean_age = df['Age'].mean()
df = df.fillna({'Age': mean_age})
print("\nDataFrame After Mean Imputation:", df, sep="\n")


Original DataFrame:
      Name   Age
0    Alice  25.0
1      Bob   NaN
2  Charlie  30.0
3    David  28.0

DataFrame After Mean Imputation:
      Name        Age
0    Alice  25.000000
1      Bob  27.666667
2  Charlie  30.000000
3    David  28.000000


In [5]:
# corrected Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.
import pandas as pd
import numpy as np
# Bob's department is the only missing value; 'HR' occurs twice, 'IT' once.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Department=['HR', np.nan, 'IT', 'HR'],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# mode() returns a Series of the most frequent value(s); take the first one
# and write it into the rows where Department is missing.
mode_value = df['Department'].mode().iloc[0]
department_missing = df['Department'].isna()
df.loc[department_missing, 'Department'] = mode_value
print("\nDataFrame After Mode Imputation:", df, sep="\n")


Original DataFrame:
      Name Department
0    Alice         HR
1      Bob        NaN
2  Charlie         IT
3    David         HR

DataFrame After Mode Imputation:
      Name Department
0    Alice         HR
1      Bob         HR
2  Charlie         IT
3    David         HR


In [6]:
# corrected Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.
import pandas as pd
import numpy as np
# Income has two large values (200000, 250000) and two gaps; Age has one gap.
data = dict(
    Income=[50000, 60000, 200000, 250000, np.nan, 80000, np.nan],
    Age=[25, 30, 35, 40, 28, np.nan, 30],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Each column's median is computed from its own observed values, so the two
# fills are independent and can be applied in a single fillna call.
income_median = df['Income'].median()
age_median = df['Age'].median()
df = df.fillna({'Income': income_median, 'Age': age_median})
print("\nDataFrame After Median Imputation:", df, sep="\n")


Original DataFrame:
     Income   Age
0   50000.0  25.0
1   60000.0  30.0
2  200000.0  35.0
3  250000.0  40.0
4       NaN  28.0
5   80000.0   NaN
6       NaN  30.0

DataFrame After Median Imputation:
     Income   Age
0   50000.0  25.0
1   60000.0  30.0
2  200000.0  35.0
3  250000.0  40.0
4   80000.0  28.0
5   80000.0  30.0
6   80000.0  30.0


In [7]:
# corrected Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
# Age and Salary each have one gap; Experience is fully observed.
data = dict(
    Age=[25, np.nan, 30, 35, 40],
    Salary=[50000, 60000, np.nan, 80000, 85000],
    Experience=[2, 5, 10, 7, 12],
)
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:", df, sep="\n")

# Impute with a 2-neighbour KNN imputer fitted on the same frame it transforms.
# NOTE(review): the columns are on very different scales and are not
# standardised before imputation - confirm this is intended.
imputer = KNNImputer(n_neighbors=2)
imputed_values = imputer.fit_transform(df)

# fit_transform's result is wrapped back into a DataFrame here, re-attaching
# the original column labels.
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)
print("\nDataFrame After KNN Imputation:", df_imputed, sep="\n")


Original DataFrame with Missing Values:
    Age   Salary  Experience
0  25.0  50000.0           2
1   NaN  60000.0           5
2  30.0      NaN          10
3  35.0  80000.0           7
4  40.0  85000.0          12

DataFrame After KNN Imputation:
    Age   Salary  Experience
0  25.0  50000.0         2.0
1  27.5  60000.0         5.0
2  30.0  70000.0        10.0
3  35.0  80000.0         7.0
4  40.0  85000.0        12.0


In [8]:
# corrected Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the most frequent category (the mode).
import pandas as pd
import numpy as np
# Category is missing in two rows; 'A' and 'C' each occur three times, 'B' twice.
data = dict(
    Category=['A', 'B', 'A', np.nan, 'B', 'C', 'A', np.nan, 'C', 'C'],
    Value=[10, 15, 10, 25, 30, 35, 40, 50, 60, 70],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Detection: boolean Series marking the rows whose Category is missing.
missing_categories = df['Category'].isna()
print("\nMissing Values in 'Category' Column:", missing_categories, sep="\n")

# Handling: fill with the first entry returned by mode(). With the tie between
# 'A' and 'C' in this data, that first entry is 'A'.
mode_value = df['Category'].mode().iloc[0]
df.loc[missing_categories, 'Category'] = mode_value
print("\nDataFrame After Filling Missing Categorical Values:", df, sep="\n")



Original DataFrame:
  Category  Value
0        A     10
1        B     15
2        A     10
3      NaN     25
4        B     30
5        C     35
6        A     40
7      NaN     50
8        C     60
9        C     70

Missing Values in 'Category' Column:
0    False
1    False
2    False
3     True
4    False
5    False
6    False
7     True
8    False
9    False
Name: Category, dtype: bool

DataFrame After Filling Missing Categorical Values:
  Category  Value
0        A     10
1        B     15
2        A     10
3        A     25
4        B     30
5        C     35
6        A     40
7        A     50
8        C     60
9        C     70


In [9]:
# corrected Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Age is missing in two rows; Salary and Experience are fully observed.
data = dict(
    Age=[25, 30, 35, np.nan, 40, 45, np.nan, 50],
    Salary=[50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000],
    Experience=[2, 5, 8, 10, 12, 15, 17, 20],
)
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:", df, sep="\n")

# Split on whether the target (Age) is known: rows with a known Age train the
# model, rows without one are the ones to predict.
age_missing = df['Age'].isna()
train_df = df.loc[~age_missing]
test_df = df.loc[age_missing]

# Predictors are the two fully observed columns.
feature_columns = ['Salary', 'Experience']
X_train, y_train = train_df[feature_columns], train_df['Age']
X_test = test_df[feature_columns]

# Fit a linear regression of Age on the predictors, then predict the gaps.
model = LinearRegression()
model.fit(X_train, y_train)
predicted_ages = model.predict(X_test)

# Write the predictions back into exactly the rows that were missing.
df.loc[age_missing, 'Age'] = predicted_ages
print("\nDataFrame After Predictive Imputation:", df, sep="\n")




Original DataFrame with Missing Values:
    Age  Salary  Experience
0  25.0   50000           2
1  30.0   60000           5
2  35.0   70000           8
3   NaN   80000          10
4  40.0   90000          12
5  45.0  100000          15
6   NaN  110000          17
7  50.0  120000          20

DataFrame After Predictive Imputation:
         Age  Salary  Experience
0  25.000000   50000           2
1  30.000000   60000           5
2  35.000000   70000           8
3  37.123288   80000          10
4  40.000000   90000          12
5  45.000000  100000          15
6  46.335616  110000          17
7  50.000000  120000          20


In [10]:
# corrected Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.
import pandas as pd
import numpy as np
# Ten consecutive daily readings; four of the temperatures are missing.
data = {
    'Date': pd.date_range(start='2025-01-01', periods=10, freq='D'),
    'Temperature': [22, np.nan, np.nan, 24, 25, np.nan, 27, np.nan, 28, 29]
}
df = pd.DataFrame(data)
print("Original Time Series DataFrame with Missing Values:")
print(df)

# Directional fills depend on row order, so make the chronological order explicit.
df = df.sort_values(by='Date')

# Forward fill: each gap takes the last observed value before it.
# Series.ffill()/bfill() replace the deprecated fillna(method=...) spelling,
# which emits a FutureWarning on recent pandas versions.
df_ffill = df.copy()
df_ffill['Temperature'] = df_ffill['Temperature'].ffill()

# Backward fill: each gap takes the next observed value after it.
df_bfill = df.copy()
df_bfill['Temperature'] = df_bfill['Temperature'].bfill()

print("\nTime Series After Forward Fill:")
print(df_ffill)
print("\nTime Series After Backward Fill:")
print(df_bfill)


Original Time Series DataFrame with Missing Values:
        Date  Temperature
0 2025-01-01         22.0
1 2025-01-02          NaN
2 2025-01-03          NaN
3 2025-01-04         24.0
4 2025-01-05         25.0
5 2025-01-06          NaN
6 2025-01-07         27.0
7 2025-01-08          NaN
8 2025-01-09         28.0
9 2025-01-10         29.0

Time Series After Forward Fill:
        Date  Temperature
0 2025-01-01         22.0
1 2025-01-02         22.0
2 2025-01-03         22.0
3 2025-01-04         24.0
4 2025-01-05         25.0
5 2025-01-06         25.0
6 2025-01-07         27.0
7 2025-01-08         27.0
8 2025-01-09         28.0
9 2025-01-10         29.0

Time Series After Backward Fill:
        Date  Temperature
0 2025-01-01         22.0
1 2025-01-02         24.0
2 2025-01-03         24.0
3 2025-01-04         24.0
4 2025-01-05         25.0
5 2025-01-06         27.0
6 2025-01-07         27.0
7 2025-01-08         28.0
8 2025-01-09         28.0
9 2025-01-10         29.0


  df_ffill['Temperature'] = df_ffill['Temperature'].fillna(method='ffill')
  df_bfill['Temperature'] = df_bfill['Temperature'].fillna(method='bfill')
