# Data pre-processing flow

### Null value cleaning and processing 
### Value Imputation
### Feature Engineering
### Data Encoding
### Data Scaling

In [None]:
import sklearn

In [None]:
from sklearn.datasets import load_iris

In [None]:
load_iris()

In [None]:
load_iris(return_X_y=True)

In [None]:
X, y = load_iris(return_X_y=True)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
Model = LinearRegression()

In [None]:
# Fit the linear regression on the iris features X and targets y loaded above.
Model.fit(X,y)

# Predict on the same rows the model was fitted on (in-sample predictions);
# as the last expression of the cell, the result is shown as the cell output.
Model.predict(X)

In [None]:
import matplotlib.pyplot as plt
# In-sample predictions of the fitted model.
pred = Model.predict(X)
# Scatter predictions (x-axis) against the true targets y (y-axis).
plt.scatter(pred, y)

---

# Null Value Processing

In [None]:
import pandas as pd

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Fetch version 1 of the 'titanic' dataset from OpenML and keep only the
# feature table stored under the 'data' key (as_frame=True requests pandas output).
df = fetch_openml('titanic', version =1, as_frame=True)['data']

#### Find the null columns

In [None]:
df.info()

In [None]:
df.isnull()

In [None]:
df.isnull().sum()

In [None]:
import seaborn as sns

In [None]:
# Apply seaborn's default plot styling.
sns.set()
# Per-column share of null entries, expressed as a percentage of the row count.
miss_value_per = pd.DataFrame((df.isnull().sum()/len(df))*100)
# One bar per column of df.
miss_value_per.plot(kind='bar', title='Missing values in percentage', ylabel='percentage')

In [None]:
print(f'size of the dataset: {df.shape}')

#### Gather information about null values and drop columns that have too many null values

In [None]:
# Remove the 'body' feature from df in place, then report the resulting shape.
df.drop(columns=['body'], inplace=True)
print(f"Size of the dataset after dropping a feature: {df.shape}")

---

# Value Imputation

#### SimpleImputer is used to fill in null values with the mean, median, or mode


In [None]:
 from sklearn.impute import SimpleImputer

In [None]:
print(f"Number of null values before imputing: {df.age.isnull().sum()}")

In [None]:
# Replace missing ages with the mean of the observed ages.
imp = SimpleImputer(strategy='mean')
# fit_transform returns a 2-D (n_rows, 1) array for the single-column input;
# flatten it so a 1-D array is assigned to the column, as the per-column
# imputation loop further down already does.
df['age'] = imp.fit_transform(df[['age']]).ravel()
print(f"Number of null values after imputing: {df.age.isnull().sum()}")

In [None]:
def get_parameters(df):
    """Build SimpleImputer settings for every column of ``df`` that has nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    dict
        Maps each column name containing at least one null to
        ``{'missing_values': <placeholder>, 'strategy': <str>}``.
        ``strategy`` is ``'mean'`` for numeric columns and
        ``'most_frequent'`` for everything else; ``missing_values`` is the
        first null placeholder found in that column (e.g. ``nan`` or ``None``).
        Columns without nulls are omitted, so a fully populated frame
        yields an empty dict.
    """
    # Local import keeps the function usable on its own.
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    parameters = {}
    for col in df.columns[df.isnull().any()]:
        series = df[col]
        # Treat every numeric width (float32, int16, ...) as numeric rather
        # than only float64/int64/int32; booleans are excluded because a
        # mean is not a valid boolean value.
        if is_numeric_dtype(series.dtype) and not is_bool_dtype(series.dtype):
            strategy = 'mean'
        else:
            strategy = 'most_frequent'

        # Hand the column's own null placeholder to the imputer.
        missing_values = series[series.isnull()].values[0]
        parameters[col] = {'missing_values': missing_values, 'strategy': strategy}
    return parameters
get_parameters(df)

In [None]:
parameters = get_parameters(df)

In [None]:
# Impute every null-containing column using the settings chosen for it.
for col, param in parameters.items():
    missing_values, strategy = param['missing_values'], param['strategy']
    imp = SimpleImputer(strategy=strategy, missing_values=missing_values).fit(df[[col]])
    # The transformed single-column input is flattened before being written back.
    df[col] = imp.transform(df[[col]]).ravel()

In [None]:
df.isnull().sum()

In [None]:
df.head()

---

# Feature Engineering

#### The code below:
1) Creates a new family column
2) Creates the travelled_alone column
3) Plots how many passengers have travelled alone on the ship



In [None]:
# family is the row-wise sum of the sibsp and parch columns.
df['family'] = df['sibsp'] + df['parch']
# Flag rows: 0 when family is positive, 1 when it is exactly zero.
# The first assignment creates the travelled_alone column; the second fills
# the rows whose family count is zero.
df.loc[df['family']>0, 'travelled_alone'] = 0
df.loc[df['family']==0, 'travelled_alone'] = 1
# Bar chart of how many rows carry each flag value.
df['travelled_alone'].value_counts().plot(title='Passenger travelled alone?', kind='bar')

---

# Data Encoding

#### The code below:
1) Selects the sex column from the df as a df
2) Uses OneHotEncoder to encode the unique categories and then transforms the result into a 2D array
3) Assigns 2 new columns in df called male and female

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the sex column and store the two resulting indicator columns
# as 'female' and 'male'. NOTE(review): this labelling relies on the encoder
# emitting its categories in sorted order ('female' before 'male') - confirm.
df[['female', 'male']] = OneHotEncoder().fit_transform(df[['sex']]).toarray()
# Show the original column next to its encoded indicators.
df[['sex', 'female', 'male']]

In [None]:
# Overwrite sex with the second one-hot column only (index 1), i.e. the same
# indicator that the previous cell stored under 'male'.
df['sex'] = OneHotEncoder().fit_transform(df[['sex']]).toarray()[:,1]

In [None]:
df.head() 

# Scaling

## If the data points in a feature are far apart from each other, scaling is a technique to bring them closer together

### MinMaxScaler

For each value in a feature, it subtracts the minimum value in the feature and then divides by the range

### StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Names of the columns whose dtype is int64, float64 or int32; these are the
# columns the scalers below operate on.
num_cols = df.select_dtypes(include=['int64', 'float64', 'int32']).columns
print(num_cols)

In [None]:
ss = StandardScaler()

In [None]:
# Standardize the numeric columns and write the result back into df.
# This must be an assignment (=): a comparison (==) would only build a
# boolean frame and throw the scaled values away, leaving df unchanged.
df[num_cols] = ss.fit_transform(df[num_cols])
# Summary statistics of the scaled columns (mean should be ~0, std ~1).
df[num_cols].describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each numeric column's minimum and range, then overwrite the columns
# with their rescaled values and display them.
minmax = MinMaxScaler().fit(df[num_cols])
df[num_cols] = minmax.transform(df[num_cols])
df[num_cols]