## PROJET FINAL: Detecting Wagon Maintenance Trips

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime, timedelta

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path of the CSV file to load, located on the mounted Drive.
file = '/content/drive/MyDrive/dataset_2.csv'

In [None]:
# Load the CSV into a DataFrame and display it.
dataset = pd.read_csv(file)
dataset

In [None]:
# List the column labels of the raw dataset.
dataset.columns

In [None]:
# Print the per-column dtype and non-null count summary.
dataset.info()

In [None]:
# Fraction of missing values per column, from the most complete column to the least.
dataset.isna().sum().div(dataset.shape[0]).sort_values()

In [None]:
# Remove the columns that are not used as model features, then display the result.
dataset = dataset.drop(
    columns=["Unnamed: 0", "date", "Latitude", "Longitude", "wagon_number"],
)
dataset

In [None]:
# Index labels of every row that contains at least one missing value.
index_with_nan = dataset[dataset.isna().any(axis="columns")].index

In [None]:
# Drop, in place, the rows whose labels are listed in index_with_nan.
# The target axis is given through the `index=` keyword: passing the axis
# positionally (`drop(labels, 0, ...)`) raises a TypeError on pandas >= 2.0.
dataset.drop(index=index_with_nan, inplace=True)

In [None]:
# Display the first rows of the cleaned dataset.
dataset.head()

## Examen de la colonne target

In [None]:
# Relative frequency of each value of the target column.
dataset['target'].value_counts(normalize=True)

In [None]:
# Split the dataset into the label column (Y) and the remaining feature columns (X).
print("Separating labels from features...")
Y = dataset["target"]
X = dataset.drop(columns="target")  # every other column is kept as a feature
print("...Done.")
# Preview both objects, separated by an empty line.
print(Y.head(), X.head(), sep="\n\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed is fixed so the split is reproducible.
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
print("...Done.", end="\n\n")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer         
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [None]:
# Median imputation applied to every column of X_train except the first one.
# NOTE(review): the imputed array is only displayed, never assigned, and
# num_imputer is not referenced again in the visible part of the notebook;
# rows containing NaN were already dropped earlier — confirm this cell is still needed.
num_imputer = SimpleImputer(strategy="median")            # missing values will be replaced by columns' median
num_imputer.fit_transform(X_train.iloc[:, 1:])

In [None]:
# Create pipeline for numeric features
num_features = [1,2,3,4,5] # Positional indices (not names) of the columns of X_train/X_test to scale; index 0 is not listed
# The pipeline has a single step: standardization of the selected columns.
num_transformer = Pipeline(steps=[
    
    ('scaler', StandardScaler())
])

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Build the ColumnTransformer that describes all preprocessing steps:
# here, only the numeric pipeline applied to the columns listed in num_features.
preprocessor = ColumnTransformer(
    transformers=[("num_transformer", num_transformer, num_features)],
)

# Fit the preprocessing on the training set and replace X_train by the transformed array.
X_train = preprocessor.fit_transform(X_train)
X_train[:5]  # first five transformed rows




In [None]:
# Learn the label encoding on the training target, then replace Y_train by its encoded version.
encoder = LabelEncoder().fit(Y_train.squeeze())
Y_train = encoder.transform(Y_train.squeeze())
Y_train

## TRAINING PHASE

In [None]:
# Fit a random forest (fixed seed) on the preprocessed training data
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(random_state=0).fit(X_train, Y_train)
# Mean accuracy on the data the model was trained on
classifier.score(X_train, Y_train)

## Predictions 

In [None]:
# Predict the labels of the training set and preview the first five predictions.
print("Predictions on training set...")
Y_train_pred = classifier.predict(X_train)
print("...Done.")
print(Y_train_pred[:5], end="\n\n")

In [None]:
# Preprocessings on test set
# Only `transform` is called: the test set must be scaled with the statistics
# learned on the training set. Refitting here (`fit_transform`) would leak
# test-set information and feed the classifier differently scaled features
# than the ones it was trained on.
X_test = preprocessor.transform(X_test)
X_test[:5]

In [None]:
# Encode target variable Y
# Reuse the encoder already fitted on Y_train so that train and test labels
# share the same class-to-integer mapping; refitting a new encoder on the test
# labels could silently produce a different mapping.
Y_test = encoder.transform(Y_test.squeeze())
Y_test

In [None]:

# Mean accuracy of the fitted classifier on the test set.
classifier.score(X_test, Y_test)

In [None]:
# Predict the labels of the test set and preview the first five predictions.
print("Predictions on test set...")
Y_test_pred = classifier.predict(X_test)
print("...Done.")
print(Y_test_pred[:5], end="\n\n")

## Performance Evaluation

In [None]:
# Report accuracy on both splits (accuracy_score expects true labels first, predictions second).
train_accuracy = accuracy_score(Y_train, Y_train_pred)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print("Accuracy on training set : ", train_accuracy)
print("Accuracy on test set : ", test_accuracy)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [None]:
def evaluation(model):
    """Fit ``model``, print its test-set metrics and plot its learning curve.

    The function reads the notebook-level ``X_train``, ``Y_train``, ``X_test``
    and ``Y_test`` variables. It prints the raw predictions, the confusion
    matrix and the classification report for the test set, then draws the
    mean train/validation F1 score for ten training-set sizes (4-fold CV).

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place on the training data.

    Returns:
        None.
    """
    model.fit(X_train, Y_train)
    # Predict with the model being evaluated. Using the global ``classifier``
    # here would report the same metrics whatever model is passed in.
    ypred = model.predict(X_test)

    print(ypred)

    print(confusion_matrix(Y_test, ypred))
    print(classification_report(Y_test, ypred))

    # Cross-validated scores for training-set fractions from 10% to 100%.
    N, train_score, val_score = learning_curve(
        model, X_train, Y_train,
        cv=4, scoring='f1',
        train_sizes=np.linspace(0.1, 1, 10),
    )

    # One curve per score type, averaged over the CV folds.
    plt.figure(figsize=(12, 8))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.legend()

In [None]:
# Run the evaluation routine on the random forest defined earlier.
evaluation(classifier)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [None]:
# Feature-engineering pipeline: degree-2 polynomial expansion (no bias column),
# then selection of the 10 features with the best f_classif score.
# NOTE(review): this rebinds `preprocessor`, which held the ColumnTransformer until here;
# re-running the earlier preprocessing cells after this one would use the wrong object.
preprocessor = make_pipeline(PolynomialFeatures(2, include_bias=False), SelectKBest(f_classif, k=10))

In [None]:
# Candidate models. Each pipeline gets its own unfitted copy of the shared
# preprocessing pipeline: passing the same instance to all four would make
# every pipeline hold (and refit) one single mutable transformer object.
# The cloned step is still auto-named "pipeline" by make_pipeline.
from sklearn.base import clone

RandomForest = make_pipeline(clone(preprocessor), RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(clone(preprocessor), AdaBoostClassifier(random_state=0))
# SVM and KNN are distance-based, so the engineered features are standardized first.
SVM = make_pipeline(clone(preprocessor), StandardScaler(), SVC(random_state=0))
KNN = make_pipeline(clone(preprocessor), StandardScaler(), KNeighborsClassifier())

In [None]:
# Display name -> pipeline, in the order in which the models are evaluated.
dict_of_models = dict(
    RandomForest=RandomForest,
    AdaBoost=AdaBoost,
    SVM=SVM,
    KNN=KNN,
)

In [None]:
# Print each model's name, then run the evaluation routine on it.
for name in dict_of_models:
    model = dict_of_models[name]
    print(name)
    evaluation(model)

## OPTIMISATION

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [None]:
# Display the SVM pipeline to read the step names used as hyper-parameter prefixes.
SVM

In [None]:
# Search space for the SVM pipeline (keys follow the `<step>__<param>` convention).
# The preprocessed X_train has 5 columns (num_features lists 5 indices), so
# PolynomialFeatures(include_bias=False) produces 20 features at degree 2 and
# 55 at degree 3. k is therefore capped at 20 to stay valid for both degrees;
# the previous range(45, 60) asked SelectKBest for more features than exist.
hyper_params = {'svc__gamma': [1e-3, 1e-4, 0.0005],
                'svc__C': [1, 10, 100, 1000, 3000],
                'pipeline__polynomialfeatures__degree': [2, 3],
                'pipeline__selectkbest__k': range(5, 21)}

In [None]:
# Randomized search over the SVM pipeline: 10 sampled configurations, 4-fold CV,
# optimizing recall. random_state is fixed so the sampled configurations (and
# the reported best parameters) are reproducible, like the rest of the notebook.
grid = RandomizedSearchCV(SVM, hyper_params, scoring='recall', cv=4,
                          n_iter=10, random_state=0)

grid.fit(X_train, Y_train)

print(grid.best_params_)

# Evaluate the refitted best configuration on the held-out test set.
y_pred = grid.predict(X_test)

print(classification_report(Y_test, y_pred))