In [73]:
import pandas as pd
import os
from utils import drop_categorical_columns
from utils import missing_values_table

In [74]:
def drop_label_with_null(df, column_name):
    # Drop rows where the specified column is null
    df_cleaned = df.dropna(subset=[column_name])

    return df_cleaned

In [75]:
path = os.path.join('Final-Datasets', 'HMEQ.csv')
original = pd.read_csv(path,index_col=0)
original.shape


(8783, 17)

In [76]:
dropped=drop_label_with_null(original, 'Happy with online education?')
dropped.shape

(8783, 17)

In [77]:
dropped.head(3)

Unnamed: 0,Level of study?,Age?,Used smartphone/computer/laptop previously before online class?,Result increased after online education (comparatively)?,Knowledge increased after online education (comparatively)?,Happy with online education?,Education Institute Area?,Have Internet availability?,Broadband / Mobile Internet?,Total hours of study before online education?,Total hours of study after online education?,Class performance increased in online education?,Institute Type,Current location (During Study) ?,Gender,Faced any issue with online class?,Preferred device for an online course
0,Upto HSC,20.0,Yes,No,Yes,No,Urban,No,Broadband,4,3,No,Public,Rural,Male,Yes,Mobile
1,Hons or Grater,25.0,No,No,No,No,Urban,No,Mobile Internet,4,4,No,Public,Rural,Male,Yes,Mobile
2,Hons or Grater,25.0,Yes,Yes,Yes,Yes,Rural,No,Mobile Internet,5,2,Yes,Public,Rural,Female,Yes,Computer


In [78]:
categorical_columns = dropped.select_dtypes(include=['object', 'category']).columns.tolist()
print(len(categorical_columns))
for col in categorical_columns:
    dropped[col] = pd.to_numeric(dropped[col], errors='ignore')
# after those are taken care of we can drop the columns that are still object
categorical_columns = dropped.select_dtypes(include=['object', 'category']).columns.tolist()
print(categorical_columns)

14
['Level of study?', 'Used smartphone/computer/laptop previously before online class?', 'Result increased after online education  (comparatively)?', 'Knowledge increased after online education (comparatively)?', 'Happy with online education?', 'Education Institute Area?', 'Have Internet availability?', 'Broadband / Mobile Internet?', 'Class performance increased in online education?', 'Institute Type', 'Current location (During Study) ?', 'Gender', 'Faced any issue with online class?', 'Preferred device for an online course']


In [79]:
missing_values_table(dropped)

Your selected dataframe has 17 columns.
There are 8 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
Institute Type,726,8.3,object
Current location (During Study) ?,726,8.3,object
Faced any issue with online class?,701,8.0,object
Gender,676,7.7,object
Education Institute Area?,529,6.0,object
Age?,445,5.1,float64
Result increased after online education (comparatively)?,323,3.7,object
Used smartphone/computer/laptop previously before online class?,188,2.1,object


In [80]:
#others then handle nullls

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
df=dropped
# Select categorical columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Create a copy of the original dataframe to preserve the original columns
df_encoded = df.copy()

# Create an empty DataFrame to store the encoded columns
encoded_data_frames = []

# Define a list of transformers for each categorical column
for col in categorical_columns:
    # Replace null values with "Others" in the original column
    df_encoded[col].fillna("Others", inplace=True)

    # Create dummy variables using OneHotEncoder
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore',drop='first')
    encoded_data = encoder.fit_transform(df_encoded[[col]])

    # Get column names based on the original column and category
    column_names = [category for category in encoder.get_feature_names_out([col])]

    # Create a DataFrame for the encoded data
    encoded_df = pd.DataFrame(encoded_data, columns=column_names, index=df_encoded.index)

    try:
      new_column_name = f"{col}_{'Others'}"
      # Drop the original column and the dummy variables for "Others"

      encoded_df.loc[df[col].isnull(), column_names] = pd.NA
      encoded_df.drop([new_column_name], axis=1, inplace=True)
    except:
      print('This column doesnt have nulls',col)

    # Add the encoded DataFrame to the list
    encoded_data_frames.append(encoded_df)

# Concatenate the original DataFrame and the list of encoded DataFrames
df_encoded = pd.concat([df_encoded] + encoded_data_frames, axis=1)

# Drop the original categorical columns
df_encoded.drop(categorical_columns, axis=1, inplace=True)

# Now, the df_encoded DataFrame contains the changes made to categorical columns, with null values preserved.

This column doesnt have nulls Level of study?
This column doesnt have nulls Knowledge increased after online education (comparatively)?
This column doesnt have nulls Happy with online education?
This column doesnt have nulls Education Institute Area?
This column doesnt have nulls Have Internet availability?
This column doesnt have nulls Broadband / Mobile Internet?
This column doesnt have nulls Class performance increased in online education?
This column doesnt have nulls Institute Type
This column doesnt have nulls Current location (During Study) ?
This column doesnt have nulls Preferred device for an online course




In [81]:
missing_values_table(df)

Your selected dataframe has 17 columns.
There are 8 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
Institute Type,726,8.3,object
Current location (During Study) ?,726,8.3,object
Faced any issue with online class?,701,8.0,object
Gender,676,7.7,object
Education Institute Area?,529,6.0,object
Age?,445,5.1,float64
Result increased after online education (comparatively)?,323,3.7,object
Used smartphone/computer/laptop previously before online class?,188,2.1,object


In [82]:
missing_values_table(df_encoded)

Your selected dataframe has 20 columns.
There are 11 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values,Data Type
Institute Type_Private,726,8.3,float64
Institute Type_Public,726,8.3,float64
Current location (During Study) ?_Rural,726,8.3,float64
Current location (During Study) ?_Urban,726,8.3,float64
Faced any issue with online class?_Yes,701,8.0,float64
Gender_Male,676,7.7,float64
Education Institute Area?_Rural,529,6.0,float64
Education Institute Area?_Urban,529,6.0,float64
Age?,445,5.1,float64
Result increased after online education (comparatively)?_Yes,323,3.7,float64


In [83]:
df_encoded.head(15)

Unnamed: 0,Age?,Total hours of study before online education?,Total hours of study after online education?,Level of study?_Upto HSC,Used smartphone/computer/laptop previously before online class?_Yes,Result increased after online education (comparatively)?_Yes,Knowledge increased after online education (comparatively)?_Yes,Happy with online education?_Yes,Education Institute Area?_Rural,Education Institute Area?_Urban,Have Internet availability?_Yes,Broadband / Mobile Internet?_Mobile Internet,Class performance increased in online education?_Yes,Institute Type_Private,Institute Type_Public,Current location (During Study) ?_Rural,Current location (During Study) ?_Urban,Gender_Male,Faced any issue with online class?_Yes,Preferred device for an online course_Mobile
0,20.0,4,3,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1,25.0,4,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
2,25.0,5,2,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,21.0,5,3,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
4,22.0,4,2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
5,25.0,3,2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
6,24.0,5,3,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,24.0,4,3,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
8,22.0,6,3,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
9,21.0,6,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [84]:
df_encoded.to_csv(os.path.join('Final-Datasets', 'HMEQ-processed.csv'),index=False)