In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
from pathlib import Path

# Sub-directory whose presence identifies the project root. Searching for it
# replaces a machine-specific absolute path, so the notebook works from any
# checkout and the cell can be re-run after the directory was already changed.
PROJECT_MARKER = Path('data') / 'uncleaned'

# Print the current working directory
start_dir = Path.cwd()
print("Current Working Directory: ", os.getcwd())

# Walk from the current directory up through its parents and take the first
# folder that contains the marker directory.
project_root = next(
    (
        folder
        for folder in (start_dir, *start_dir.parents)
        if (folder / PROJECT_MARKER).is_dir()
    ),
    None,
)
if project_root is None:
    # Same exception type os.chdir raises for a non-existent target.
    raise FileNotFoundError(
        f"Could not find a '{PROJECT_MARKER.as_posix()}' directory in "
        f"'{start_dir}' or any of its parent directories."
    )

# Change the current working directory
os.chdir(project_root)
print("New Working Directory: ", os.getcwd())

Current Working Directory:  c:\Users\naren\Desktop\DS\Projects\MLPROJECT\notebooks\eda
New Working Directory:  C:\Users\naren\Desktop\DS\Projects\MLPROJECT


In [3]:
# Location of the raw students CSV (a relative path, so it is resolved
# against the current working directory)
data_path = 'data/uncleaned/students.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows
data.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72.0,72.0,74.0
1,female,group C,some college,standard,completed,69.0,90.0,88.0
2,female,group B,master's degree,standard,none,90.0,95.0,93.0
3,male,group A,associate's degree,free/reduced,none,47.0,57.0,44.0
4,male,group C,some college,standard,none,76.0,78.0,75.0


In [5]:
# Keep a reference to the frame's column Index, then print it for inspection
all_columns = data.columns
print(all_columns)

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')


In [11]:
# Column names the dataset is expected to contain, one per line for easy diffing
required_columns = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
    'math_score',
    'reading_score',
    'writing_score',
]

In [12]:
# Collect every required column that is absent from the available columns,
# keeping the order in which the required columns were declared
missing_columns = list(
    filter(lambda name: name not in all_columns, required_columns)
)

# An empty list means nothing is missing
if not missing_columns:
    print("All required columns are present.")
else:
    print(f"Missing columns: {missing_columns}")

All required columns are present.


In [13]:
# Expected dtype name for each column: text columns are 'object', the three
# score columns are 'float64'
expected_dtypes = dict(
    gender='object',
    race_ethnicity='object',
    parental_level_of_education='object',
    lunch='object',
    test_preparation_course='object',
    math_score='float64',
    reading_score='float64',
    writing_score='float64',
)

# Compare each available column's dtype with its expected dtype and report
# the outcome; columns without an expectation are skipped silently
for name in all_columns:
    if name not in expected_dtypes:
        continue
    found = data[name].dtype
    wanted = expected_dtypes[name]
    if found == wanted:
        print(f"Column '{name}' is of the correct type: {wanted}.")
    else:
        print(f"Column '{name}' should be of type {wanted}, but found {found}.")

Column 'gender' is of the correct type: object.
Column 'race_ethnicity' is of the correct type: object.
Column 'parental_level_of_education' is of the correct type: object.
Column 'lunch' is of the correct type: object.
Column 'test_preparation_course' is of the correct type: object.
Column 'math_score' is of the correct type: float64.
Column 'reading_score' is of the correct type: float64.
Column 'writing_score' is of the correct type: float64.


In [6]:
# Count null entries per column
missing_values = data.isna().sum()

# Report only the columns that actually contain nulls
if not missing_values.any():
    print("No missing values detected.")
else:
    print(f"Columns with missing values:\n{missing_values[missing_values > 0]}")

Columns with missing values:
gender                         2
race_ethnicity                 2
parental_level_of_education    1
lunch                          1
math_score                     1
reading_score                  1
writing_score                  1
dtype: int64


In [7]:
# Number of rows that are exact repeats of an earlier row
duplicate_rows = data.duplicated().sum()

# Single print call; the message depends on whether any repeats were found
print(
    f"There are {duplicate_rows} duplicate rows in the data."
    if duplicate_rows > 0
    else "No duplicate rows found."
)

There are 2 duplicate rows in the data.


In [8]:
# List the distinct values (NaN included) of every object-dtype column
categorical_cols = data.select_dtypes(include='object').columns
for name in categorical_cols:
    distinct = data[name].unique()
    print(f"Unique values in '{name}': {distinct}")

Unique values in 'gender': ['female' 'male' nan]
Unique values in 'race_ethnicity': ['group B' 'group C' 'group A' 'group D' nan 'group E']
Unique values in 'parental_level_of_education': ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school' nan]
Unique values in 'lunch': ['standard' 'free/reduced' nan]
Unique values in 'test_preparation_course': ['none' 'completed']


In [9]:
# Print the smallest and largest value of every numeric column
numeric_cols = data.select_dtypes(include='number').columns
for name in numeric_cols:
    series = data[name]
    print(f"Column '{name}' has min value: {series.min()} and max value: {series.max()}")

Column 'math_score' has min value: 0.0 and max value: 100.0
Column 'reading_score' has min value: 17.0 and max value: 100.0
Column 'writing_score' has min value: 10.0 and max value: 100.0


In [10]:
# Report the frame's dimensions (rows first, then columns)
n_rows, n_cols = data.shape
print(f"Data has {n_rows} rows and {n_cols} columns.")

Data has 1002 rows and 8 columns.
