<a href="https://colab.research.google.com/github/Nomad653/Python/blob/main/GridSearch_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import numpy as np

In [39]:
import pandas as pd

In [40]:
from sklearn.linear_model import LogisticRegression

In [41]:
from sklearn.preprocessing import StandardScaler

In [42]:
from sklearn.pipeline import Pipeline

In [43]:
from sklearn.impute import SimpleImputer

In [44]:
from sklearn.preprocessing import OneHotEncoder

In [45]:
from sklearn.compose import ColumnTransformer

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
from scipy.stats import zscore

In [49]:
from sklearn import tree

In [50]:
from sklearn.decomposition import PCA

In [51]:
from sklearn.model_selection import GridSearchCV

In [52]:
data = pd.read_csv("/content/drive/MyDrive/Machine Learning/Titanic competition/train.csv")

In [53]:
new_data = data.drop(columns=["HomePlanet","Name","PassengerId"])

In [54]:
new_data['Total_billed'] = new_data.RoomService + new_data.FoodCourt + new_data.ShoppingMall +new_data.Spa +new_data.VRDeck

In [55]:
conditions = [
              (new_data['Age']<2),
              (new_data['Age']<=15), #Child
              (new_data['Age']>15) & (new_data['Age']<=18), #Teenager
              (new_data['Age']>18) & (new_data['Age']<=35),#Young adult
              (new_data['Age']>35) & (new_data['Age']<=50), #Mid-age
              (new_data['Age']>50) & (new_data['Age']<=60),#Old
              (new_data['Age']>60)


]
values = ['Infant','Child','Teenager','Young adult','Mid-age','Old','Elderly']

In [56]:
new_data['Age_group'] = np.select(conditions,values)

In [57]:
new_data['Cabin'].value_counts()

G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64

In [58]:
#new_data['Family'] = np.where(((new_data['Cabin'].value_counts())>1),True,False)

In [59]:
X  =new_data.drop("Transported",axis=1)

y = new_data["Transported"]

In [60]:
num=X.select_dtypes(include="number").columns
cat=X.select_dtypes(include="object").columns

In [61]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [62]:
num_pipeline = Pipeline(steps=[("num_imputer", SimpleImputer(strategy="most_frequent",missing_values=np.nan)),
                               ("scaler",StandardScaler())
                        ])

cat_pipeline = Pipeline(steps=[
                               ('imputer',SimpleImputer(strategy="most_frequent",missing_values=np.nan)),
                               ('onehot',OneHotEncoder(handle_unknown="ignore",sparse=False))
])

In [63]:
preprocessor = ColumnTransformer([("numerical_pipeline",num_pipeline,num),
                                  ("categorical_pipeline",cat_pipeline,cat)
                                  ])

In [64]:
pipe = Pipeline(steps=[
                       ("preprocessor",preprocessor),
                       ('pca', PCA(n_components=14)),
                       
                       ("clf",LogisticRegression(max_iter=300))
                       
                       

])

In [65]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'pca', 'clf', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__numerical_pipeline', 'preprocessor__categorical_pipeline', 'preprocessor__numerical_pipeline__memory', 'preprocessor__numerical_pipeline__steps', 'preprocessor__numerical_pipeline__verbose', 'preprocessor__numerical_pipeline__num_imputer', 'preprocessor__numerical_pipeline__scaler', 'preprocessor__numerical_pipeline__num_imputer__add_indicator', 'preprocessor__numerical_pipeline__num_imputer__copy', 'preprocessor__numerical_pipeline__num_imputer__fill_value', 'preprocessor__numerical_pipeline__num_imputer__missing_values', 'preprocessor__numerical_pipeline__num_imputer__strategy', 'preprocessor__numerical_pipeline__num_imputer__verbose', 'preprocessor__numerical_pipeline__scaler__copy'

In [94]:
param_grid = [
              {
                  
                  "preprocessor__numerical_pipeline__num_imputer__strategy":["mean","most_frequent"],
                  "clf__C": [0.1, 1.0, 10.0, 100.0],
                  "pca__n_components": [14,15,16],
                  "clf": [LogisticRegression()]
              },
              {
                  
                  "preprocessor__numerical_pipeline__num_imputer__strategy":["mean","most_frequent"],
                  "clf__n_estimators":[2,4,6,8,10,12,14],
                  "pca__n_components": [14,15,16],
                  "clf": [RandomForestClassifier()]
              },
              {
                  "preprocessor__numerical_pipeline__num_imputer__strategy":["mean","most_frequent"],
                  "pca__n_components": [14,15,16],
                  "clf":[tree.DecisionTreeClassifier()]
                  
                  
              }
]

In [96]:
grid_search = GridSearchCV(pipe, param_grid, cv=2,scoring='accuracy', verbose=10,n_jobs=-1)
grid_search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__preprocessor', 'estimator__pca', 'estimator__clf', 'estimator__preprocessor__n_jobs', 'estimator__preprocessor__remainder', 'estimator__preprocessor__sparse_threshold', 'estimator__preprocessor__transformer_weights', 'estimator__preprocessor__transformers', 'estimator__preprocessor__verbose', 'estimator__preprocessor__verbose_feature_names_out', 'estimator__preprocessor__numerical_pipeline', 'estimator__preprocessor__categorical_pipeline', 'estimator__preprocessor__numerical_pipeline__memory', 'estimator__preprocessor__numerical_pipeline__steps', 'estimator__preprocessor__numerical_pipeline__verbose', 'estimator__preprocessor__numerical_pipeline__num_imputer', 'estimator__preprocessor__numerical_pipeline__scaler', 'estimator__preprocessor__numerical_pipeline__num_imputer__add_indicator', 'estimator__preprocessor__numerical_pipeline__num_imputer__copy', 'estimator__preprocessor__nu

In [None]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 456 candidates, totalling 4560 fits


In [98]:
y_pred = grid_search.predict(X_test)

In [99]:
grid_search.best_params_

{'clf': LogisticRegression(C=0.1),
 'clf__C': 0.1,
 'pca__n_components': 15,
 'preprocessor__numerical_pipeline__num_imputer__strategy': 'mean'}

In [100]:
from sklearn.metrics import accuracy_score

In [101]:
acc1 = accuracy_score(y_test,y_pred)

In [102]:
acc1

0.7757331799884991