### Imports


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


### Import data and see its shape

In [3]:
# Load the student dataset from CSV (path is relative to the current working directory)
df = pd.read_csv('data/stud.csv')


In [4]:
# Preview the first five rows to get a feel for the columns and their values
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Dimensions of the frame as (number of rows, number of columns)
df.shape

(1000, 8)

## Analysis to Perform

1. Check for missing values
2. Check for duplicates
3. Check data types
4. Check the number of unique values in each column
5. Check summary statistics of the dataset
6. Check the categories present in each categorical column

In [6]:
# 1. Check for missing values: count the missing entries in each column.
# (isnull() is an alias of isna(), so either call gives the same result.)
df.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [7]:
# 2. Check for duplicates: count rows that exactly repeat an earlier row

df.duplicated().sum()


np.int64(0)

In [8]:
# 3. Check data types: info() lists each column's dtype and non-null count

df.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [9]:
# 4. Check the number of unique values in each column
df.nunique()


gender                          2
race_ethnicity                  5
parental_level_of_education     6
lunch                           2
test_preparation_course         2
math_score                     81
reading_score                  72
writing_score                  77
dtype: int64

In [10]:
# 5. Check summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()




Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


#### Insights

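1. The mean scores are close to each other, ranging from about 66.1 (math) to 69.2 (reading).
2. The standard deviations are also similar, between about 14.6 and 15.2.
3. The minimum score is 0 for math, but noticeably higher for writing (10) and reading (17).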

### Exploring Data

In [11]:
# Show the distinct values of every categorical column, one line per column.
# The three score columns are left out on purpose: they are numeric, not categories.
# Labels are kept verbatim so the printed report stays exactly the same.
category_labels = {
    'gender': "Categoeries available in gender: ",
    'race_ethnicity': "Categeories in race: ",
    'parental_level_of_education': "Categories in parental education: ",
    'lunch': "Categories in lunch: ",
    'test_preparation_course': "Categories in test preparation: ",
}

for column, label in category_labels.items():
    # print() separates its arguments with one space, reproducing the original spacing
    print(label, df[column].unique())


Categoeries available in gender:  ['female' 'male']
Categeories in race:  ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in parental education:  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in lunch:  ['standard' 'free/reduced']
Categories in test preparation:  ['none' 'completed']


In [13]:
# Split the columns into numerical and categorical groups by dtype.
# select_dtypes() is used rather than comparing each dtype against 'O' (object):
# the object comparison labels every non-object column (bool, datetime, category, ...)
# as numerical, and it breaks when text columns use a dedicated string dtype.
# Here every column that is not numeric is treated as categorical.
numeric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

# Report how many columns fall in each group, and which ones
print(f'We have {len(numeric_features)} numerical features: {numeric_features}')
print(f'\nWe have {len(categorical_features)} categorical features: {categorical_features}')

We have 3 numerical features: ['math_score', 'reading_score', 'writing_score']

We have 5 categorical features: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [16]:
# Derive two summary columns from the three subject scores.
score_columns = ['math_score', 'reading_score', 'writing_score']

# Row-wise sum of the subject scores; skipna=False keeps the same
# missing-value propagation as adding the columns with `+`.
df['total_score'] = df[score_columns].sum(axis=1, skipna=False)

# Mean of the three subjects
df['average_score'] = df['total_score'].div(3)

# Preview the frame with the new columns appended
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score,average_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333
