In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw student-performance table from the mounted Drive folder and
# preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/StudentsPerformance.csv')
data.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Column groups used throughout the rest of the notebook.
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
numerical_columns = ['math_score', 'reading_score', 'writing_score']

# Normalise the headers: lower-case them and swap spaces for underscores
# (e.g. "math score" -> "math_score"). Slashes are left untouched.
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Number of fully duplicated rows; reported only, nothing is dropped here.
duplicates = data.duplicated().sum()

# One-hot encode the categorical columns. drop_first=True omits the first
# category of each column, which then acts as the implicit baseline.
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

# Summary statistics for the three score columns.
descriptive_stats = data_encoded.loc[:, numerical_columns].describe()

(duplicates, data_encoded.head(), descriptive_stats)

(np.int64(0),
    math_score  reading_score  writing_score  gender_male  \
 0          72             72             74        False   
 1          69             90             88        False   
 2          90             95             93        False   
 3          47             57             44         True   
 4          76             78             75         True   
 
    race/ethnicity_group B  race/ethnicity_group C  race/ethnicity_group D  \
 0                    True                   False                   False   
 1                   False                    True                   False   
 2                    True                   False                   False   
 3                   False                   False                   False   
 4                   False                    True                   False   
 
    race/ethnicity_group E  parental_level_of_education_bachelor's degree  \
 0                   False                                           Tr

In [4]:
# Write the one-hot encoded frame to Drive; index=False leaves the row index
# out of the CSV.
processed_file_path = '/content/drive/MyDrive/processed_file_path.csv'
data_encoded.to_csv(path_or_buf=processed_file_path, index=False)

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Folder that receives every figure saved by the plotting cells.
output_dir = '/content/drive/MyDrive/results'
# plt.savefig does not create missing folders, so make sure the target exists
# before the first figure is written (a no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

#  Box Plot of Math Scores by Gender
# `gender_male` is the dummy column produced by get_dummies: False for female
# rows, True for male rows.
plt.figure(figsize=(8, 6))
sns.boxplot(x='gender_male', y='math_score', data=data_encoded)
plt.title('Math Scores by Gender')
plt.xlabel('Gender (Male)')
plt.ylabel('Math Score')
# NOTE(review): these tick labels assume the False (female) box is drawn at
# position 0 and the True (male) box at position 1 - confirm on the figure.
plt.xticks([0, 1], ['Female', 'Male'])
plt.savefig(f'{output_dir}/math_scores_by_gender.png')
# Close the figure so it is neither shown inline nor kept in memory.
plt.close()


#  Bar Plot of Average Scores by Race/Ethnicity
race_columns = ['race/ethnicity_group B', 'race/ethnicity_group C', 'race/ethnicity_group D', 'race/ethnicity_group E']
score_columns = ['math_score', 'reading_score', 'writing_score']

# Grouping directly on the four dummy columns labels each bar with a tuple of
# booleans such as (False, True, False, False). Collapse the dummies back into
# one readable label per row so the x axis shows the group names instead.
race_flags = data_encoded[race_columns].astype(int)
race_group = (
    race_flags.idxmax(axis=1)                               # name of the set dummy
    .str.replace('race/ethnicity_', '', regex=False)        # strip the dummy prefix
    # Rows with no dummy set belong to the category removed by drop_first=True.
    # NOTE(review): assumed to be 'group A' (the only group without a dummy
    # column) - confirm against the raw data.
    .where(race_flags.any(axis=1), 'group A')
    .rename('race/ethnicity')
)

# Mean of each subject per group, then averaged across the three subjects.
race_avg_scores = data_encoded.groupby(race_group)[score_columns].mean().mean(axis=1)

plt.figure(figsize=(10, 6))
race_avg_scores.plot(kind='bar', rot=0)
plt.title('Average Scores by Race/Ethnicity')
plt.xlabel('Race/Ethnicity Group')
plt.ylabel('Average Score')
plt.savefig(f'{output_dir}/average_scores_by_race.png')
plt.close()


# Heatmap of Correlation Matrix

In [16]:

# Pairwise correlations between every column of the encoded frame (scores and
# dummy indicators alike), drawn as an annotated heatmap and saved to disk.
corr = data_encoded.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr,
    cmap='coolwarm',
    annot=True,  # print the coefficient inside each cell
)
plt.title('Correlation Matrix')
plt.savefig(f'{output_dir}/correlation_matrix.png')
plt.close()



#Pie Chart of Test Preparation Course Completion

In [17]:

# `test_preparation_course_none` is True when the course was NOT completed.
test_prep_counts = data_encoded['test_preparation_course_none'].value_counts()

# value_counts() orders its result by frequency, so a fixed label list can end
# up attached to the wrong slice. Derive each label from the index value it
# belongs to instead.
status_names = {True: 'Not Completed', False: 'Completed'}
slice_labels = [status_names[bool(flag)] for flag in test_prep_counts.index]

plt.figure(figsize=(6, 6))
plt.pie(test_prep_counts, labels=slice_labels, autopct='%1.1f%%', startangle=90)
plt.title('Test Preparation Course Completion')
plt.savefig(f'{output_dir}/test_preparation_completion.png')
plt.close()



# Pair Plot of Scores

In [18]:

# Scatter-matrix of the three score columns.
sns.pairplot(data_encoded[['math_score', 'reading_score', 'writing_score']])
# The title is placed slightly above the top edge of the figure (y=1.02) so it
# does not overlap the upper row of panels.
plt.suptitle('Pair Plot of Scores', y=1.02)
# bbox_inches='tight' grows the saved area to include the title; without it
# the title lies outside the figure bounds and is cut off in the PNG.
plt.savefig(f'{output_dir}/pair_plot_of_scores.png', bbox_inches='tight')
plt.close()