<a href="https://colab.research.google.com/github/SirineMaaroufi/ML-Competition/blob/main/Solution_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Competition Aug22 By Business & AI
### Author: Sirine Maàroufi

## Importing important libraries

In [None]:
pip install feature_engine



In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn import metrics
import math
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from feature_engine import transformation as vt
from feature_engine.imputation import DropMissingData, MeanMedianImputer, CategoricalImputer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SmartCorrelatedSelection,DropConstantFeatures,DropDuplicateFeatures
from sklearn.preprocessing import PolynomialFeatures,  StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import  Pipeline
from sklearn.linear_model import LinearRegression , Ridge , RANSACRegressor, Lasso, ElasticNet , HuberRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from feature_engine.outliers import OutlierTrimmer
from sklearn.ensemble import  RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings("ignore")

<a id='wrangling'></a>
## Data Wrangling
### General Properties

In [None]:
# Importing the training data
df = pd.read_csv('Xy_train.csv', sep = ';')

In [None]:
# Getting the shape of our data
df.shape

(45000, 28)

In [None]:
# Getting infos about our data: data types, missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 28 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   _id      45000 non-null  object 
 1   categ_0  45000 non-null  int64  
 2   txt_0    45000 non-null  object 
 3   categ_1  45000 non-null  int64  
 4   num_0    45000 non-null  int64  
 5   categ_2  45000 non-null  int64  
 6   categ_4  45000 non-null  int64  
 7   categ_5  45000 non-null  int64  
 8   num_5    12584 non-null  float64
 9   categ_8  45000 non-null  int64  
 10  num_3    14534 non-null  float64
 11  num_8    9241 non-null   float64
 12  num_1    20797 non-null  object 
 13  num_9    3261 non-null   float64
 14  txt_2    45000 non-null  object 
 15  date_0   45000 non-null  object 
 16  categ_9  45000 non-null  int64  
 17  num_10   43464 non-null  float64
 18  categ_6  45000 non-null  int64  
 19  categ_7  45000 non-null  int64  
 20  txt_3    45000 non-null  object 
 21  num_4    397

In [None]:
df.isnull().sum()

_id            0
categ_0        0
txt_0          0
categ_1        0
num_0          0
categ_2        0
categ_4        0
categ_5        0
num_5      32416
categ_8        0
num_3      30466
num_8      35759
num_1      24203
num_9      41739
txt_2          0
date_0         0
categ_9        0
num_10      1536
categ_6        0
categ_7        0
txt_3          0
num_4       5276
categ_3        0
txt_1          0
num_6      43318
num_2      35674
num_7      42688
y              1
dtype: int64

In [None]:
df.duplicated().sum()

0

### Data preparation

In [None]:
# Data Imputation
median_imputer = MeanMedianImputer(
                   imputation_method='median')
median_imputer.fit(df)
df= median_imputer.transform(df)

# Replace non numeric values in num_1 with None
df.num_1 = df.num_1.replace(['- ch'], [None]).replace(['- kw (- ch)'], [None])

# Impute num_1
imputer = CategoricalImputer(variables=['num_1'],imputation_method='frequent')
imputer.fit(df)
df= imputer.transform(df)

# Correct datatypes
df.num_1 = df.num_1.astype('int64')
df.date_0 = pd.to_datetime(df['date_0'])

# Remove outliers
outlier_remover = OutlierTrimmer(capping_method='iqr')
#outlier_remover.fit(df)
#df = outlier_remover.transform(df)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 28 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   _id      45000 non-null  object        
 1   categ_0  45000 non-null  int64         
 2   txt_0    45000 non-null  object        
 3   categ_1  45000 non-null  int64         
 4   num_0    45000 non-null  int64         
 5   categ_2  45000 non-null  int64         
 6   categ_4  45000 non-null  int64         
 7   categ_5  45000 non-null  int64         
 8   num_5    45000 non-null  float64       
 9   categ_8  45000 non-null  int64         
 10  num_3    45000 non-null  float64       
 11  num_8    45000 non-null  float64       
 12  num_1    45000 non-null  int64         
 13  num_9    45000 non-null  float64       
 14  txt_2    45000 non-null  object        
 15  date_0   45000 non-null  datetime64[ns]
 16  categ_9  45000 non-null  int64         
 17  num_10   45000 non-null  float6

In [None]:
# Choose target and features
y = df.y

X = df.drop(['y','_id'],axis=1)

# Break off Validation set from training data
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=100)

### Preprocessing Pipeline

In [None]:
# Preprocessing for numerical data
#from feature_engine.creation import MathFeatures, RelativeFeatures,

numerical_ix = X.select_dtypes(include=['int64', 'float64']).columns

numerical_transformer = Pipeline(steps=[
    ('power', vt.PowerTransformer(variables = None, exp=0.5)),
    ('scaler', StandardScaler())

])

# Preprocessing for text data
text_transformer = Pipeline(steps=[
    ('tfidf',TfidfVectorizer(max_df=0.8, max_features=2000, min_df=0.01, sublinear_tf=True,
                             use_idf=True, ngram_range=(1,1)))
  ])

# Preprocessing for Datetime Variable
date_transformer= DatetimeFeatures(
    variables=["date_0"],
    features_to_extract=["month", "year"]
)

# Define data transformer
preprocessor = ColumnTransformer(
    transformers=[
          # Extract features from Datetime variable
          ('date',date_transformer ,['date_0']),

          # Extract features from text variables
          ('text_0', text_transformer, 'txt_0'),
          ('text_1', text_transformer, 'txt_1'),
          ('text_2', text_transformer, 'txt_2'),
         #('text_3', text_transformer, 'txt_3'),  #text_3 gets dropped in preprocess_pipe

          # Preprocess numerical variables
          ('num',numerical_transformer, make_column_selector(dtype_include=['int64','float64']))

])

# Preprocessing Pipeline
preprocess_pipe = Pipeline(steps=[
       # Drop irrelevent columns
       ('drop_constant_values', DropConstantFeatures()),
       ('drop_duplicates', DropDuplicateFeatures()),
       # Preprocessing
       ('prep',preprocessor)

])


### Feature Engineering Pipeline

In [None]:
FE_pipe = Pipeline(steps=[
       # Create Polynomial features
       ('poly', PolynomialFeatures(degree= 2, include_bias=False))
])


### ML Pipeline

In [None]:
# Regressor

regressor = RandomForestRegressor()        # 0.92 /0.948/0.949/0.956/0.964  / 0.934

# Wrapped model
wrapped_model = TransformedTargetRegressor(regressor= regressor, transformer = StandardScaler())

# ML pipeline
ML_pipe = Pipeline(steps=[

        # Reduce Dimensionality
        ('TruncatedSVD',TruncatedSVD(algorithm='randomized',n_components=100,
                                     n_iter=7, random_state=42)),

        # Select features
        ('selector', SmartCorrelatedSelection(   variables=None, method="pearson",
                                                threshold=0.8,missing_values="ignore",
                                                selection_method="model_performance",
                                               scoring='r2', estimator=RandomForestRegressor(),
                                           )),
        # Model
        ('model', wrapped_model)
])

# Pipeline
pipe = Pipeline([

    # Step 1: Preprocessing
    ('Preprocessing', preprocess_pipe),
    # Step 2: Create polynomial features
    ('Feature_Engineering', FE_pipe),
    # Model
    ('ML', ML_pipe)

])

print(regressor)
# Preprocessing of training data, fit model
pipe.fit(X_train, y_train)

# Preprocessing of Validation data, get predictions
y_pred_val = pipe.predict(X_val)

# Performance evaluation

# Mean Absolute Error
print("\tMean absolute error:", mean_absolute_error(y_val, y_pred_val ))

# R Squared & Adjusted R Squared
r2 = r2_score(y_val, y_pred_val)
adj_r2 = 1 - (1-r2)*(len(X_train) - 1) / (len(X_train) - (X_train.shape[1] - 1) - 1)
print("\tR2 score:", r2)
print("\tAdjusted R2 score:", adj_r2)

# Root Mean Squared Error
mse = mean_squared_error(y_val, y_pred_val)
rmse = math.sqrt(mse)
print('\tRMSE: %f' % rmse)

RandomForestRegressor()
	Mean absolute error: 1766.070626827289
	R2 score: 0.9001814489697529
	Adjusted R2 score: 0.900112080432038
	RMSE: 3101.065949


In [14]:
from sklearn import set_config
set_config(display='diagram')
display(pipe)

In [None]:
from sklearn.model_selection import GridSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the param grid
parameters = {'ML__model__regressor__n_estimators': n_estimators,
               'ML__model__regressor__max_features': max_features,
               'ML__model__regressor__max_depth': max_depth,
               'ML__model__regressor__min_samples_split': min_samples_split,
               'ML__model__regressor__min_samples_leaf': min_samples_leaf,
               'ML__model__regressor__bootstrap': bootstrap}

estimator = GridSearchCV(pipe, param_grid = parameters, cv = 2, n_jobs=-1)
estimator.fit(X_train, y_train)
print(estimator.score(X_val,y_val))
print(estimator.best_params_,'\n')
print(estimator.best_estimator_,'\n')

# --> best estimator =

### Predicting

In [15]:
# Importing test data
X_test_all= pd.read_table('X_test.csv', sep=';')
X_test = X_test_all.drop(['_id'],axis=1)

# Test data preparation

# Imputation
median_imputer = MeanMedianImputer(
                   imputation_method='median')
median_imputer.fit(X_test)
X_test = median_imputer.transform(X_test)

# Replace non numeric values in num_1 with None
X_test.num_1 = X_test.num_1.replace(['- ch'], [None]).replace(['- kw (- ch)'], [None])

# Impute num_1
imputer = CategoricalImputer(variables=['num_1'],imputation_method='frequent')
imputer.fit(X_test)
X_test= imputer.transform(X_test)

# Correct datatypes
X_test.num_1 = X_test.num_1.astype('int64')
X_test.date_0 = pd.to_datetime(X_test['date_0'])

# Remove outliers
#outlier_remover = OutlierTrimmer(capping_method='iqr')
#outlier_remover.fit(X_test)
#X_test =outlier_remover.transform(X_test)

In [16]:
# Preprocessing of testing data, get predictions
y_pred_test = pipe.predict(X_test)

In [17]:
id=X_test_all['_id']
y_pred_test= pd.Series(y_pred_test)
y_pred = pd.concat([id.reset_index(drop=True), y_pred_test.reset_index(drop=True)], axis=1)
y_pred = y_pred.rename(columns={0:"y_pred"})
y_pred

Unnamed: 0,_id,y_pred
0,6251210db22e614680cef17b,3081.580000
1,609a41b746383f48ea7ed9c4,16703.070000
2,609b46c346383f48ea85a360,11372.790000
3,609a42ee46383f48ea7ef087,7841.750000
4,609a42cd46383f48ea7eed93,15693.050000
...,...,...
14995,60c24c9b13534a25ba2a0db5,15903.017500
14996,60f9d5d134503562bdb88291,29292.536333
14997,624a3221792dc740f97d0aad,10777.350000
14998,624a54d7792dc740f97e2ccc,12237.670000


In [18]:
y_pred.to_csv('y_pred.csv',sep = ';',columns=['_id','y_pred'],index = False)

In [None]:
# Performance evaluation

# Mean Absolute Error
print("\tMean absolute error:", mean_absolute_error(y_test, y_pred_test))

# R Squared & Adjusted R Squared
r2 = r2_score(y_test, y_pred_test)
adj_r2 = 1 - (1-r2)*(len(X_train) - 1) / (len(X_train) - (X_train.shape[1] - 1) - 1)
print("\tR2 score:", r2)
print("\tAdjusted R2 score:", adj_r2)

# Root Mean Squared Error
mse = mean_squared_error(y_test, y_pred_test)
rmse = math.sqrt(mse)
print('\tRMSE: %f' % rmse)

In [None]:
# Preprocessing of testing data, get predictions with GridSearch best estimator
y_pred_test_grid = estimator.best_estimator_.predict(X_test)