In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Removing Duplicates from the Dataset:

In [4]:
# COUNTING THE NUMBER OF DUPLICATE ENTRIES

num_duplicates = df.duplicated().sum()

In [5]:
print("Number of duplicates found:", num_duplicates)

Number of duplicates found: 0


In [33]:
# AS THERE ARE NO DUPLICATES IN OUR DATA WE DONT HAVE TO DELETE THEM

# TO DELETE DUPLCIATES WE USE THE FOLLOWING COMMAND:

#df = df.drop_duplicates()

## Handling Missing Values in the Dataset:

In [34]:
df.isnull()

# There are some null values present in the data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,False,False,False,False,True,False
887,False,False,False,False,False,False,False,False,False,False,False,False
888,False,False,False,False,False,True,False,False,False,False,True,False
889,False,False,False,False,False,False,False,False,False,False,False,False


In [35]:
# This following command tells the frequency of missing data in each columns 

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [36]:
df.isnull().sum().sum()

# Total 866 NULL values found in the dataset

866

#### Replacing NULL entries in the Dataset with the value 5:

In [37]:
df3 = df.fillna(value = 5)
df3

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,5,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,5,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,5,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,5,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,5.0,1,2,W./C. 6607,23.4500,5,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [38]:
df2.isnull().sum().sum()

# We have succesfully imputed the NULL values 

0

## Handling Outliers in Dataset:

#### Using InterQuantile Range (IQR) Method:

In [None]:
# Steps for IQR:

# (1) Sort data in Ascending Order.
# (2) Calculate 1st Quantile(q1) and 3rd Quantile(q3).
# (3) Find IQR Range (q3 - q1)
# (4)Find lower bound q1 * 1.5
# (5)Find upper bound q3 * 1.5

# Anything that lies outside the upper and lower bounds are outliers

In [63]:
# Sorting

sorted_df = df.sort_values('Age', ascending = False).head(10)
print(sorted_df)

     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0     

In [64]:
sorted_column = sorted_df['Age']
print(sorted_column)

630    80.0
851    74.0
493    71.0
96     71.0
116    70.5
672    70.0
745    70.0
33     66.0
54     65.0
280    65.0
Name: Age, dtype: float64


In [67]:
quantile1, quantile3 = np.percentile(sorted_column, [25,75])
print(quantile1, quantile3)

67.0 71.0


In [68]:
# find IQR

iqr_value = quantile3 - quantile1
print(iqr_value)

4.0


In [69]:
# Finding lower and upper bounds

lower_bound_val = quantile1 - (1.5 * iqr_value)
upper_bound_val = quantile3 + (1.5 * iqr_value)

print(lower_bound_val, upper_bound_val)

61.0 77.0


In [71]:
sorted_column = df[df['Age'] > 77]

# Print the filtered rows
print(sorted_column['Age'])

# So Age greater than 77 is a outlier 

630    80.0
Name: Age, dtype: float64


In [82]:
# Searching for age > 77 in the dataset

df.loc[(df['Age'] > 77)]


# We only find 1 tuple where Age of the person is above 77 so this must be the outlier we were looking for

Empty DataFrame
Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [84]:
# Removing the entry

df = df.drop(df[df['Age'] > 77].index)

##  standardize numerical features


In [86]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
mmx = MinMaxScaler()
std = StandardScaler()
x_numeric = df.select_dtypes(include = np.number)
x_std = std.fit_transform(x_numeric)
x_norm = mmx.fit_transform(x_numeric)

## Encoding

In [90]:
from sklearn.preprocessing import LabelEncoder

x_cat = df.select_dtypes(include = 'object')

n_x_cat = pd.DataFrame()
le = LabelEncoder()

for i in x_cat.columns:
    col = x_cat[i].unique()
    le.fit(col)
    n_x_cat[i]=le.transform(x_cat[i])
