## 01 Import used libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [2]:
# required and used Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

## 02 Used Functions for feature engineering

*   Get dataframes data removing unwanted labels and null data

In [3]:
from scipy.linalg import dft
def getTrainTestdata(df, required_label, removing_labels):

  # removeing null values and not required labels
  df = df.drop(removing_labels, axis =1).dropna()

  return df, df.pop(required_label)

*   Random Forest Classifier used to create the model


In [4]:
def useRandommForestModel(X_train, y_train, X_test):
  # Define model. Specify a number for random_state to ensure same results each run
  model = RandomForestClassifier(random_state=1)

  # Fit model
  model.fit(X_train, y_train)

  # predict values using model
  predict_val = model.predict(X_test)

  return predict_val

*   K-Nearest Neighbor used to create the model

In [5]:
def useKNN(X_train, y_train, X_test, n_neighbors = 5):
  # Create and train your KNN classifier model
  # You can perform hyperparameter tuning by changing the value of 'n_neighbors'
  knn = KNeighborsClassifier(n_neighbors)
  knn.fit(X_train, y_train)
  y_pred = knn.predict(X_test)
  return y_pred

  # create KNN model
  # classifier = KNeighborsClassifier(n_neighbors=5)
  # classifier.fit(X_train, y_train)
  # y_pred = classifier.predict(X_test)
  # return y_pred

*   create a model using support vector machine using **sklearn.svm**

In [6]:
from sklearn.svm import SVC
def useSVM(X_train, y_train, X_test):
  # standardize the data set
  svc = SVC(kernel="linear")

  # create KNN model
  svc.fit(X_train, y_train)
  y_pred = svc.predict(X_test)
  return y_pred

* Get the accuracy and report of the predictions

In [7]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

def getSummary(y_test, predict_val):
    print(confusion_matrix(y_test, predict_val))
    print(f"Accuracy: {accuracy_score(y_test, predict_val)}")
    print(f"F1 Score: {f1_score(y_test, predict_val, average='weighted')}")
    print(f"Precision: {precision_score(y_test, predict_val, average='weighted')}")
    print(f"Recall: {recall_score(y_test, predict_val, average='weighted')}")
    return [f"Accuracy: {accuracy_score(y_test, predict_val)}", f"F1 Score: {f1_score(y_test, predict_val, average='weighted')}",f"Precision: {precision_score(y_test, predict_val, average='weighted')}", f"Recall: {recall_score(y_test, predict_val, average='weighted')}"]
  # # get the mean absolute error of the model
  # MAE = mean_absolute_error(y_test, predict_val)

  # # get the accuracy of the model
  # accuracy = accuracy_score(y_test, predict_val)
  # report = classification_report(y_test, predict_val)
  # return MAE, report,accuracy

*   select best K number of features using sklearn **SelectBest** function which uses **f_regression**

In [8]:
from sklearn.feature_selection import SelectKBest

from sklearn.feature_selection import f_regression

def getSelectedFeatures(k, X_train, y_train):
  # define number of features to keep "k"

  # perform feature selection
  X_new = SelectKBest(f_regression, k=k).fit_transform(X_train, y_train)

  # get feature names of selected features
  selected_features = X_train.columns[SelectKBest(f_regression, k=k).fit(X_train, y_train).get_support()]

  return selected_features

*   Add a ***Principal Component Analysis(PCA)*** and get additional features by finding principal components

In [9]:
from sklearn.decomposition import PCA
def apply_pca(n_Components, X_train, X_test, X_test_predict, standardize=True):
    # Standardize
    if standardize:
        X_train = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
    # Create principal components
    pca = PCA(n_Components, svd_solver='full')
    pca.fit(X_train)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    X_test_predict_pca = pca.transform(X_test_predict)
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]
    X_train_pca = pd.DataFrame(X_train_pca, columns=component_names)
    X_test_pca = pd.DataFrame(X_test_pca, columns=component_names)
    X_test_predict_pca = pd.DataFrame(X_test_predict_pca, columns=component_names)
    # Create loadings
    # loadings = pd.DataFrame(
    #     pca.components_.T,  # transpose the matrix of loadings
    #     columns=component_names,  # so the columns are the principal components
    #     index=X_train.columns,  # and the rows are the original features
    # )
    return pca, X_train_pca, X_test_pca, X_test_predict_pca

## 03 Reducing features by feature engineering of Label 01



###Get dataset and remove un wanted labels

In [10]:
train = pd.read_csv("drive/MyDrive/ML_Project/Layer_12_train.csv")
test = pd.read_csv("drive/MyDrive/ML_Project/Layer_12_valid.csv")
test_predict = pd.read_csv("drive/MyDrive/ML_Project/Layer_12_test.csv")


X_train_label1, y_train_label1= getTrainTestdata(train, "label_1",['label_2','label_3','label_4'])
X_test_label1, y_test_label1  = getTrainTestdata(test, "label_1",['label_2','label_3','label_4'])
X_test_predict_label1  = test_predict.drop('ID', axis =1).dropna()

# standardize the data set
scaler = StandardScaler()
scaler.fit(X_train_label1)

component_names = X_train_label1.columns

 # scale the data set using new scale
X_train_label1 = pd.DataFrame(scaler.transform(X_train_label1), columns=component_names)
X_test_label1 =  pd.DataFrame(scaler.transform(X_test_label1), columns=component_names)
X_test_predict_label1 = pd.DataFrame(scaler.transform(X_test_predict_label1), columns=component_names)

X_train_label1.head()

In [11]:
y_train_label1.value_counts().plot(kind='bar', edgecolor='black')

### create a model using Random KNN before feature engineering

In [29]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = useKNN(X_train_label1, y_train_label1, X_test_label1)
Summary = getSummary(y_test_label1, y_pred)

print(classification_report)

### create a model using Random SVC before feature engineering

In [30]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = useSVM(X_train_label1, y_train_label1, X_test_label1)
Summary = getSummary(y_test_label1, y_pred)

print(classification_report)

### Add KNN predicted data since it has the better accuracy to csv file

In [31]:
# from sklearn.metrics import classification_report, confusion_matrix

# predicted_values = dict()

# Before_FE_predict_label = useKNN(X_train_label1, y_train_label1, X_test_predict_label1)
# predicted_values['Predicted labels before feature engineering'] = Before_FE_predict_label

### Find best k features and select

    (before selecting features checked if k is enough to achieve a better accuracy and finally come up with k features is enough)

In [12]:
selected_features = getSelectedFeatures(300, X_train_label1, y_train_label1)

# print selected features
print(selected_features)

### Check the accuracy of the selected features are enogh or not using random forest

In [13]:
X_train_selected_label1 = X_train_label1[list(selected_features)]
X_test_selected_label1 = X_test_label1[list(selected_features)]
X_test_predict_selected_label1 = X_test_predict_label1[list(selected_features)]

### Check the accuracy of the selected features are enogh or not using KNN

In [34]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = useKNN(X_train_selected_label1, y_train_label1, X_test_selected_label1)
Summary = getSummary(y_test_label1, y_pred)

print("using KNN after redusing the features to create model: ")
print(classification_report)

### Check the accuracy of the selected features are enogh or not using SVC

In [35]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = useSVM(X_train_selected_label1, y_train_label1, X_test_selected_label1)
Summary = getSummary(y_test_label1, y_pred)

print("using SVM after redusing the features to create model: ")
print(classification_report)

### Create a PCA to create new features using train data

In [14]:
pca, X_train_pca, X_test_pca, X_test_predict_pca = apply_pca(0.99, X_train_label1, X_test_label1, X_test_predict_label1)
print(X_test_pca.keys())
# print(loadings.to_string())

### Find the status of the model using KNN after joining PCAs

In [37]:
from sklearn.metrics import classification_report, confusion_matrix
y_pred = useKNN(X_train_pca, y_train_label1, X_test_pca, 5)
Summary = getSummary(y_test_label1, y_pred)

print("Mean absolute error after adding new pca features of the model Lable_1: ")
print("Classification report: \n", classification_report)

### Find the status of the model using SVC after joining PCAs

In [38]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = useSVM(X_train_pca, y_train_label1, X_test_pca)
Summary = getSummary(y_test_label1, y_pred)

### Highper parameter Tuning

*  Hyper parameter Tuning for SVM modle

In [39]:
from sklearn.model_selection import RandomizedSearchCV
svc_tuning = SVC(class_weight="balanced")
param_dist = {
    'C': [0.1, 10, 15],
    'gamma': [0.001, 0.05, 0.1, 0.5, 1],
    'kernel': ['linear'],
    'degree': [1, 2]
}
random_search = RandomizedSearchCV(
    svc_tuning,
    param_distributions = param_dist, cv=5, n_iter =2, n_jobs =-1)
random_search.fit(X_train_pca, y_train_label1)

In [40]:
predict2 = random_search.predict(X_test_pca)
accuracy_score(y_test_label1, predict2)

In [41]:
print(random_search.best_estimator_.get_params())

In [42]:
from sklearn.model_selection import RandomizedSearchCV as rscv
svc_tuninh2 = SVC(class_weight="balanced")
param_dist1 = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf', 'linear']
}
random_search2 = rscv(
    svc_tuninh2,
    param_distributions=param_dist1,
    n_iter=2, cv=5, n_jobs=-1, verbose=1,
    )
random_search2.fit(X_train_pca, y_train_label1)

In [43]:
predict2 = random_search2.predict(X_test_pca)
accuracy_score(y_test_label1, predict2)

In [44]:
print(random_search2.best_estimator_.get_params())

*  Hyper parameter Tuning for KNN modl


In [15]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
gs = RandomizedSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3, n_jobs = -1)

In [16]:
# fit the model on our train set
g_res = gs.fit(X_train_pca, y_train_label1)

In [17]:
# get the hyperparameters with the best score
g_res.best_params_
# find the best score
g_res.best_score_

In [18]:
knn = KNeighborsClassifier(algorithm= 'auto', leaf_size= 30, metric= 'manhattan', metric_params= None, n_jobs= None, n_neighbors= 5, p= 2, weights= 'distance')
knn.fit(X_train_pca, y_train_label1)
y_pred = knn.predict(X_test_pca)
Summary = getSummary(y_test_label1, y_pred)

## Get best prediction

In [33]:
from sklearn.metrics import classification_report, confusion_matrix

knn = KNeighborsClassifier(algorithm= 'auto', leaf_size= 30, metric= 'manhattan', metric_params= None, n_jobs= None, n_neighbors= 7, p= 2, weights= 'distance')

knn.fit(X_train_label1, y_train_label1)
y_pred = knn.predict(X_test_label1)
Summary = getSummary(y_test_label1, y_pred)


y_pred_test = knn.predict(X_test_predict_pca)


[[3 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 1]
 [0 0 0 ... 0 2 0]
 [0 0 0 ... 0 0 3]]
Accuracy: 0.192
F1 Score: 0.17956984151745717
Precision: 0.2965600387824828
Recall: 0.192


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:

# Create DataFrame
df = pd.DataFrame(y_pred_test, columns =["label_1"] )
df.head()
df.to_csv("drive/MyDrive/ML_Project/Project_Layer_12_Label_1", index=False)