In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppresses every warning category from here on, so deprecation notices
# raised by the libraries used below will not be displayed.
warnings.filterwarnings('ignore')


In [None]:
# Location of the training CSV.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the drive is mounted before running.
DATA_PATH = '/content/drive/MyDrive/DA-CaseStudies/Datasets/train.csv'

data = pd.read_csv(DATA_PATH)
data.head()

In [6]:
# `data` is assigned from pd.read_csv above, which already returns a DataFrame,
# so this re-wrap is not expected to change its contents.
data = pd.DataFrame(data)

**1. Display Top 5 rows of the Dataset**

In [None]:
# head() with no argument returns the first 5 rows
data.head()

**2. Check last 3 rows of the Dataset**

In [None]:
# Last 3 rows of the frame
data.tail(3)

**3. Find Shape of our Dataset (rows and Columns)**

In [None]:
# (number of rows, number of columns)
data.shape

**4. Get information about our Dataset: total number of rows, total number of columns, datatype of each column, and memory requirement**

In [None]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage
data.info()

**5. Get Overall Statistics about the DataFrame**

In [None]:
# With default arguments, describe() summarises the numeric columns only
data.describe()

In [None]:
# include='all' also summarises non-numeric columns (count, unique, top, freq)
data.describe(include='all')

**6. Data Filtering** — selecting rows that satisfy a condition

In [None]:
# Index holding the column labels
data.columns

In [None]:
data['Name']    # a single label selects one column and returns a Series

In [None]:
data[['Name','Age','Sex']]    # a list of labels selects several columns and returns a DataFrame

In [None]:
# Boolean masking: build each condition separately, combine with &, preview the matches
is_male = data['Sex'] == 'male'
is_24_or_older = data['Age'] >= 24
data[is_male & is_24_or_older].head()

In [None]:
# Rows where 'Survived' equals 1, restricted to the Name / Age / Sex columns

survived_mask = data['Survived'] == 1
survived = data.loc[survived_mask, ['Name', 'Age', 'Sex']]
survived

**7. Check Null Values in the Dataset**

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# Missing-value summary per column: absolute count and share of all rows in percent
missing_count = data.isnull().sum()
row_count = data.shape[0]
nulls = {
    'nulls': missing_count,
    'Percentage': missing_count / row_count * 100,
}
nulls['Percentage'].sort_values(ascending=False)

In [None]:
# Heatmap of the boolean missing-value mask (True where a value is missing)

missing_mask = data.isnull()
sns.heatmap(missing_mask)
plt.show()

**8. Drop the Columns**

In [43]:
# 'Cabin' has mostly missing values, so the whole column is removed

data = data.drop(columns=['Cabin'])

**9. Handle Missing Values**

In [None]:
# 'Embarked' is categorical: count how often each category occurs

data['Embarked'].value_counts()

In [None]:
# Most frequent value(s) of the column, returned as a Series
data['Embarked'].mode()

In [55]:
# Fill missing 'Embarked' entries with the column's most frequent value.
# The mode is computed from the data rather than hard-coding 'S', so the cell
# stays correct if the dataset changes. mode() returns a Series, hence .iloc[0].
embarked_mode = data['Embarked'].mode().iloc[0]
data['Embarked'] = data['Embarked'].fillna(embarked_mode)

In [52]:
# Alternative, left commented out: impute the most frequent value with scikit-learn
# instead of fillna.
# NOTE(review): fit_transform returns a 2-D array; confirm the single-column
# assignment below works as intended (it may need .ravel()) before enabling this.
# from sklearn.impute import SimpleImputer

# col = 'Embarked'
# si = SimpleImputer(strategy='most_frequent')
# data[col] = si.fit_transform(data[[col]])

In [61]:
# 'Age' is numeric: replace missing entries with the column median

age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(age_median)

**10. Categorical Data Encoding**
- Since most machine learning models only accept numerical variables, preprocessing the categorical variables is a necessary step. We need to encode these categorical variables as numbers so that the model can understand them and extract valuable information.



In [64]:
# 'Sex' has only 2 unique categories, so a direct value mapping is enough
data['Sex'].unique()   # not displayed: only a cell's last expression is shown

sex_codes = {'male': 1, 'female': 0}
data['Gender_encoded'] = data['Sex'].map(sex_codes)   # a new column is added at the end

In [67]:
# Place the encoded column at a specific position (index 5) instead of at the end.
# DataFrame.insert raises ValueError when the label already exists, which happens
# if 'Gender_encoded' was assigned earlier or this cell is re-run, so any existing
# copy of the column is removed first.

x = data['Sex'].map({'male':1 , 'female':0})
if 'Gender_encoded' in data.columns:
    data = data.drop(columns=['Gender_encoded'])
data.insert(5, 'Gender_encoded', x)

If the column has many unique categories

In [83]:
# 'Embarked': list the distinct categories present in the column

data['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [87]:
# One-hot encode 'Embarked' as integer indicator columns. drop_first=True leaves out
# the first category's column; plain pd.get_dummies(data, columns=['Embarked']) keeps all of them.
encode_cols = ['Embarked']
data1 = pd.get_dummies(data, columns=encode_cols, drop_first=True, dtype=int)

In [None]:
# Preview the frame with the encoded 'Embarked' columns
data1.head()

**11. What is Univariate Analysis?**
- countplot (discrete / categorical variable)
- hist/boxplot(Continuous Variable)
- Pie Chart

- How Many people Survived and How Many Died ?

In [None]:
# Number of rows for each 'Survived' value
data['Survived'].value_counts()

In [None]:
# One bar per 'Survived' value: 0 - Died , 1 - Survived
sns.countplot(x='Survived', data=data)
plt.show()

- How many passengers were in first class, second class, and third class?

In [None]:
# Rows per 'Pclass' value, ordered by class label rather than by frequency
data['Pclass'].value_counts().sort_index()

In [None]:
# One bar per 'Pclass' value showing the number of rows
sns.countplot(x='Pclass', data=data)
plt.show()

In [None]:
# Pie chart of the number of rows per 'Pclass' value.
# Labels are taken from the index of the same value_counts() result that supplies the
# wedge sizes: value_counts() orders by frequency while unique() orders by first
# appearance, so pairing the two can attach the wrong label to a wedge.
pclass_counts = data['Pclass'].value_counts()
plt.pie(pclass_counts, labels=pclass_counts.index, autopct='%.2f%%')
plt.show()

- Number of Male and Female Passengers ?

In [None]:
# Number of rows for each 'Sex' value
data['Sex'].value_counts()

In [None]:
# One bar per 'Sex' value, coloured from a two-colour palette
# NOTE(review): newer seaborn releases deprecate `palette` without `hue` — confirm against the installed version
sex_palette = ['blue', 'orange']
sns.countplot(x='Sex', data=data, palette=sex_palette)
plt.show()

In [None]:
# Histogram of 'Age' in 10 bins with a kernel density estimate overlaid
sns.histplot(x='Age', data=data, bins=10, kde=True)
plt.show()

In [None]:
# Vertical box plot of 'Age'.
# The column is named by keyword (data= / y=) because the meaning of seaborn's first
# positional argument differs between releases (x in older versions, data in newer
# ones). Naming it keeps the box vertical regardless of the installed version.
sns.boxplot(data=data, y='Age')
plt.show()

**12. Bivariate Analysis**

- Who has a better chance of survival: male or female?

In [None]:
# Row counts for every Sex / Survived combination; margins=True adds the 'All' totals
pd.crosstab(data['Sex'],data['Survived'] , margins=True)

In [None]:
# Bar height is the mean of 'Survived' within each 'Sex' group (barplot's default estimator)
sex_colors = ['red', 'blue']
sns.barplot(x='Sex', y='Survived', data=data, palette=sex_colors)
plt.show()

- Which passenger class has a better chance of survival?

In [None]:
# Row counts for every Pclass / Survived combination; margins=True adds the 'All' totals
pd.crosstab(data['Pclass'],data['Survived'] , margins=True)

In [None]:
# Bar height is the mean of 'Survived' within each 'Pclass' group (barplot's default estimator)
class_colors = ['red', 'blue', 'green']
sns.barplot(x='Pclass', y='Survived', data=data, palette=class_colors)
plt.show()

**13. Trivariate Analysis**

In [None]:
# Mean of 'Survived' per 'Pclass', with separate bars for each 'Sex' value
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=data)
plt.show()

**14. Feature Engineering**
- Feature Engineering is the process of creating new features or transforming existing features to improve the performance of a machine-learning model. It involves selecting relevant information from raw data and transforming it into a format that can be easily understood by a model. The goal is to improve model accuracy by providing more meaningful and relevant information.

In [None]:
# Feature Engineering :- It is a process of using domain knowledge to extract 'features' from raw data via data mining techniques so that these features can be used to improve the performance of ML algorithms

In [123]:
# New feature: SibSp + Parch + 1 (the +1 presumably counts the passenger themselves)

relatives_aboard = data['SibSp'].add(data['Parch'])
data['Family_size'] = relatives_aboard.add(1)

In [128]:
# New feature: 'Fare' divided by 'Family_size', rounded to 2 decimals
# NOTE(review): the label is spelled 'Fair_per_Person' ('Fare' was probably intended); left unchanged in case other code refers to it

fare_share = data['Fare'].div(data['Family_size'])
data['Fair_per_Person'] = fare_share.round(2)

In [None]:
# First 3 rows, now including the columns added above
data.head(3)