### Data Collection

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the StudentsPerformance dataset.
# Raw string: in a normal literal the backslashes in a Windows path are escape
# introducers (\S, \M, \D are invalid escapes and only work by accident; a folder
# starting with 'n' or 't' would silently turn into a newline/tab).
# NOTE(review): this is an absolute, machine-specific path — adjust it (or make it
# relative to the notebook) before running elsewhere.
DATA_PATH = r'C:\SUREN_NEW\MACHINE_LEARNING-PROJECTS\DATA\StudentsPerformance.csv'

df = pd.read_csv(DATA_PATH)

In [None]:
# Display the first rows of the loaded frame (head() with no argument).
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

### Dataset Cleaning

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact repeats of an earlier row.
df.duplicated().sum()

In [None]:
# Print a per-column summary: non-null counts and dtypes.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) as produced by describe().
df.describe()

### Exploring Data

In [None]:
# Show the first rows again as a reference for the column names used in the cells below.
df.head()

In [None]:
# Print the distinct values found in each categorical column.
# The label is built from the actual column name, so it cannot drift away from the
# column being inspected (the old hand-written labels said 'race_ethnicity' for the
# 'race/ethnicity' column and were missing a space before 'parental level of education').
for column in [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]:
    print(f"Categories in '{column}' variable:", df[column].unique())
    print()

In [None]:
# Split the columns into numerical and categorical features.
# select_dtypes is used instead of comparing each dtype with 'O': string columns are
# not guaranteed to be stored as object dtype (pandas has dedicated string dtypes),
# in which case the `== 'O'` test would report every column as numerical.
numeric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

# Report both groups.
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

### FEATURE ENGINEERING

In [None]:
# Derive two summary columns from the three subject scores:
#   'total score' - sum of math, reading and writing
#   'average'     - that total divided by the three subjects
math_marks = df['math score']
reading_marks = df['reading score']
writing_marks = df['writing score']

df['total score'] = math_marks + reading_marks + writing_marks
df['average'] = df['total score'] / 3

# Show the first rows with the new columns appended.
df.head()

In [None]:
# Boolean mask: True for every row whose math score is exactly 100.
got_full_math = df['math score'] == 100
print(got_full_math)

# 'average' values of those rows, followed by how many of them are non-null.
full_math_averages = df[got_full_math]['average']
print(full_math_averages)
print(full_math_averages.count())

In [None]:
# For each subject, count the rows scoring exactly 100
# (counted through the non-null 'average' values of the matching rows).
reading_full = df.loc[df['reading score'] == 100, 'average'].count()
writing_full = df.loc[df['writing score'] == 100, 'average'].count()
math_full = df.loc[df['math score'] == 100, 'average'].count()

for subject, n_students in (('Maths', math_full), ('Writing', writing_full), ('Reading', reading_full)):
    print(f'Number of students with full marks in {subject}: {n_students}')

In [None]:
# For each subject, count the rows scoring 20 or below
# (counted through the non-null 'average' values of the matching rows).
# The comparison is inclusive (<= 20), so the printed messages say "20 marks or less";
# the previous "less than 20" wording did not match what was being counted.
# Variable names are kept as-is in case later cells refer to them.
reading_less_20 = df[df['reading score'] <= 20]['average'].count()
writing_less_20 = df[df['writing score'] <= 20]['average'].count()
math_less_20 = df[df['math score'] <= 20]['average'].count()

print(f'Number of students with 20 marks or less in Maths: {math_less_20}')
print(f'Number of students with 20 marks or less in Writing: {writing_less_20}')
print(f'Number of students with 20 marks or less in Reading: {reading_less_20}')

### DATA VISUALIZATION (EDA)

In [None]:
# Histogram of the per-student 'average', drawn separately for each gender;
# kde=True overlays a smoothed density curve on each group's histogram.
sns.histplot(
    x='average',
    hue='gender',
    kde=True,
    data=df,
)

In [None]:
# Histogram of 'total score', drawn separately for each gender, with a KDE curve per group.
sns.histplot(
    x='total score',
    hue='gender',
    kde=True,
    data=df,
)

In [None]:
# Histogram of 'average' split by lunch type, with a KDE curve per group.
# For a single-gender view, filter the frame first:
#   data=df[df.gender == 'male']   or   data=df[df.gender == 'female']
sns.histplot(
    x='average',
    hue='lunch',
    kde=True,
    data=df,
)

In [None]:
# Histogram of 'average' split by parental level of education, with a KDE curve per group.
# For a single-gender view, filter the frame first:
#   data=df[df.gender == 'male']   or   data=df[df.gender == 'female']
sns.histplot(
    x='average',
    hue='parental level of education',
    kde=True,
    data=df,
)

In [None]:
# Insight
# We can see that parental level of education affects student performance.

In [None]:
# Histogram of 'average' split by race/ethnicity group, with a KDE curve per group.
# For a single-gender view, filter the frame first:
#   data=df[df.gender == 'male']   or   data=df[df.gender == 'female']
sns.histplot(
    x='average',
    hue='race/ethnicity',
    kde=True,
    data=df,
)

In [None]:
# Violin plot of the math score distribution.
# Swap y for 'writing score' or 'reading score' to inspect the other subjects.
sns.violinplot(
    data=df,
    y='math score',
)

In [None]:
# Row of five pie charts showing the share of each category in the categorical columns.

# NOTE: this changes the default figure size for every plot drawn after this cell.
# It is deliberately left global: the following cells use very large tick labels,
# presumably sized for this 30x12 canvas — confirm before scoping it to this figure.
plt.rcParams['figure.figsize'] = (30, 12)


def _title_case(label):
    """Upper-case the first letter of every word, leaving the other characters unchanged."""
    return ' '.join(word[:1].upper() + word[1:] for word in str(label).split())


# (column, chart title, wedge colours) for each pie, left to right.
pie_specs = [
    ('gender', 'Gender', ['red', 'green']),
    ('race/ethnicity', 'Race/Ethnicity', ['red', 'green', 'blue', 'cyan', 'orange']),
    ('lunch', 'Lunch', ['red', 'green']),
    ('test preparation course', 'Test Course', ['red', 'green']),
    ('parental level of education', 'Parental Education',
     ['red', 'green', 'blue', 'cyan', 'orange', 'grey']),
]

for position, (column, title, color) in enumerate(pie_specs, start=1):
    plt.subplot(1, len(pie_specs), position)
    size = df[column].value_counts()

    # Take the labels from the counted index rather than a hand-typed tuple:
    # value_counts() orders by frequency, so hard-coded labels end up on the wrong
    # wedge (or raise on a length mismatch) as soon as the data changes.
    labels = [_title_case(category) for category in size.index]

    # '%.2f%%' -> e.g. "51.80%". The previous '.%2f%%' printed a stray leading dot
    # and six decimals (".51.800000%").
    plt.pie(size, colors=color, labels=labels, autopct='%.2f%%')
    plt.title(title, fontsize=20)
    plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Bar per gender for the 'average' column (aggregated by barplot's default estimator);
# width=0.3 narrows the bars.
# To compare another measure, set y to 'total score', 'math score',
# 'writing score' or 'reading score'.
sns.barplot(
    x='gender',
    y='average',
    hue='gender',
    width=0.3,
    data=df,
)

# Enlarge the tick labels on both axes.
plt.tick_params(axis='both', labelsize=50)

In [None]:
# Math score per gender, with one bar for each test-preparation status.
# Set y to 'reading score' or 'writing score' for the other subjects.
sns.barplot(
    data=df,
    x='gender',
    y='math score',
    hue='test preparation course',
)

# Enlarge the tick labels on both axes.
plt.tick_params(axis='both', labelsize=50)

In [None]:
# Horizontal box plot of the math scores.
# Set x to 'reading score' or 'writing score' for the other subjects.
sns.boxplot(
    x='math score',
    data=df,
)

# Enlarge the tick labels on both axes.
plt.tick_params(axis='both', labelsize=50)

In [None]:
# Pairwise relationships between the numeric columns, coloured by gender.
sns.pairplot(
    data=df,
    hue='gender',
)