In [None]:
# imports

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pp
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# get_data combines the data files found in the separate sub-folders of the main folder into a single pandas DataFrame
# Required parameter: location of the main folder

def get_data(folder_location):
    """Load every readable file found in the sub-folders of ``folder_location``.

    Each immediate sub-folder is treated as one class: every file inside it is
    read with ``pd.read_csv(..., skiprows=4)``, its "# Columns: time" column is
    renamed to "Time" (kept as the last data column), and a "Label" column
    holding the sub-folder name is appended.

    Compared with the previous implementation this version
    * never calls ``os.chdir``, so the notebook's working directory is left
      untouched;
    * builds paths with ``os.path.join`` instead of a hard-coded "\\";
    * visits every (non-hidden) sub-folder in sorted order instead of skipping
      whichever entry ``os.listdir`` happened to return first;
    * reports why a file could not be read instead of swallowing the error.

    Parameters
    ----------
    folder_location : str
        Path of the main folder that contains one sub-folder per label.

    Returns
    -------
    tuple
        ``(all_datas, data)`` where ``all_datas`` is the list of per-file
        DataFrames and ``data`` is their concatenation.

    Raises
    ------
    ValueError
        If no file could be read at all.
    """

    def file_to_df(location, label):
        # The first four lines of each file are skipped before the header row.
        df = pd.read_csv(location, skiprows=4)
        # pop + assign moves the time column to the end under its new name.
        df["Time"] = df.pop("# Columns: time")
        df["Label"] = label
        return df

    all_datas = []
    for folder_name in sorted(os.listdir(folder_location)):
        folder_path = os.path.join(folder_location, folder_name)
        # Loose files next to the class folders and hidden folders
        # (e.g. notebook checkpoints) do not hold labelled data.
        if folder_name.startswith(".") or not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            location = os.path.join(folder_path, file_name)
            if not os.path.isfile(location):
                continue
            try:
                all_datas.append(file_to_df(location, folder_name))
            except Exception as error:
                # Best effort: one unreadable file must not abort the whole load.
                print("could not read the file in this location : " + location
                      + " (" + str(error) + ")")

    if not all_datas:
        raise ValueError("no readable data files found under " + str(folder_location))

    data = pd.concat(all_datas)

    return all_datas, data
            
        
        

In [None]:
# Location of the extracted AReM dataset. Set the AREM_DATA_DIR environment
# variable to load it from somewhere else without editing the notebook; the
# original hard-coded location is kept as the fallback.
DATA_DIR = os.environ.get("AREM_DATA_DIR", "C:\\Users\\RISHI\\Desktop\\AReM")

List_of_data, df = get_data(DATA_DIR)

# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# Build the profiling report for the loaded frame, then render it as widgets.
profile_report = pp.ProfileReport(df)
profile_report.to_widgets()

In [None]:
#### An invalid CSV file was read while loading, which results in null values:
#### rows containing nulls are removed first, then duplicated rows are removed.

df = df.dropna()
df = df.drop_duplicates()

In [None]:
# Columns starting with "var" hold variances according to the dataset description.
# Zero readings are treated as invalid here and are replaced with the column
# mean (computed before the replacement, i.e. with the zeros still included).

for var_column in ("var_rss12", "var_rss13", "var_rss23"):
    # Assign the result back to the frame. Calling replace(..., inplace=True)
    # on df[col] acts on an intermediate Series and leaves df unchanged when
    # pandas Copy-on-Write is active.
    df[var_column] = df[var_column].replace(0, df[var_column].mean())



In [None]:
# Box plot of every column except "Time", used to eyeball outliers
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df.drop(columns=["Time"]), ax=ax)

In [None]:
# Work on a separate copy so that the cleaned frame `df` is preserved
df2 = df.copy(deep=True)

In [None]:
# Re-create the working copy from `df` (same statement as the previous cell)
df2 = df.copy(deep=True)
def drop_outliers(data,column,threshold):
    """Return the rows of ``data`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR`` and
    are inclusive: a value sitting exactly on a fence is kept. This matters
    when the IQR is 0 (e.g. a constant column), where exclusive fences would
    discard every row.

    Quartiles are computed with ``Series.quantile`` (linear interpolation),
    which ignores missing values; rows whose ``column`` value is missing are
    not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column to inspect.
    threshold : float
        Multiplier applied to the IQR (1.5 is the usual choice).

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that fall inside the fences.
    """
    values = data[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_limit = q1 - threshold * iqr
    upper_limit = q3 + threshold * iqr

    # Series.between is inclusive on both ends by default.
    return data[values.between(lower_limit, upper_limit)]
    
    
#df2 = drop_outliers(df2,"avg_rss13",1.6)
#df2 = drop_outliers(df2,"avg_rss23",1.5)
#df2 = drop_outliers(df2,"avg_rss12",1.5) 
#df2 = drop_outliers(df2,"var_rss12",0.5)
#df2 = drop_outliers(df2,"var_rss13",1.4)
#df2 = drop_outliers(df2,"var_rss23",1.6)


In [None]:
# Target and features: "Label" is the target, every other column is a feature

y = df2["Label"]
x = df2.drop(columns=["Label"])

In [None]:
#Scaling the data
# NOTE(review): the scaler is fitted on every row before the train/test split
# in the next cell, so the held-out rows influence the scaling statistics.

scalar = StandardScaler()
x_scaled = scalar.fit_transform(x)

#Encoding Target Category
# Encode straight from df2["Label"] rather than from `y`: overwriting `y` with
# the transform of itself made this cell fail when it was run a second time.

encoder = LabelEncoder()
y = encoder.fit_transform(df2["Label"])

# Show the complete code -> label mapping (the index is the encoded value)
# instead of looking up a hard-coded list of codes that skipped class 0.
pd.Series(encoder.classes_, name="Label")

In [None]:
# Train and test split: 25% of the rows are held out (the scikit-learn default)
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Logistic regression with default hyper-parameters, fitted on the training split
logreg = LogisticRegression().fit(x_train, y_train)
logreg

In [None]:
# Returns the confusion matrix and the classification report of a fitted model

def performance(x_train,x_test, y_train, y_test, model):
    """Evaluate ``model`` on the test split.

    ``x_train`` and ``y_train`` are accepted but not used.

    Returns
    -------
    tuple
        ``(cm, report_values)``: the confusion matrix of ``y_test`` against
        the model's predictions for ``x_test``, and the classification report
        (``output_dict=True``) as a list of ``(key, value)`` pairs.
    """
    y_pred = model.predict(x_test)

    cm = confusion_matrix(y_test, y_pred)
    report_values = list(
        classification_report(y_test, y_pred, output_dict=True).items()
    )

    return cm, report_values
    

In [None]:
# Evaluate the default logistic regression on the held-out split.
# NOTE(review): every drop_outliers call earlier in the notebook is commented
# out, so as written this is evaluated on data that still contains the
# outliers — confirm which variant this result is meant to represent.
model_without_outliers = performance(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    model=logreg,
)

In [None]:
# When comparing the model performance with and without the outliers, the model
# with the outliers present performs better.
# NOTE(review): this call receives exactly the same split and model as the
# previous cell; the comparison relies on re-running the notebook with the
# drop_outliers calls toggled — confirm.
model_with_outliers = performance(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    model=logreg,
)

In [None]:
#Hyperparameter Tuning
# Candidate values for LogisticRegression; fine_tuning passes this list to
# RandomizedSearchCV as `param_distributions`.
# NOTE(review): every key is sampled independently, so solver / penalty / dual
# combinations that LogisticRegression does not support can be drawn (for
# example 'elasticnet' without an l1_ratio) — confirm against the scikit-learn
# documentation and consider one dict per solver.
param_grid = [{"penalty" : ['l1', 'l2', 'elasticnet'],
               "dual" : [True, False],
               "tol" : [0.0001,0.0002,0.0003],
               "C" :[1.0, 1.5, 0.5],
               "intercept_scaling" : [1,2],
               "solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               "max_iter" : [100,150,200],
               "multi_class" : ['auto', 'ovr', 'multinomial'],
               "warm_start" : [True,False],
               "n_jobs" : [-1]}]

def fine_tuning(param_grid,estimator,n_iter, features=None, target=None, random_state=42):
    """Run a randomized hyper-parameter search and return the best result.

    Parameters
    ----------
    param_grid : dict or list of dict
        Passed to ``RandomizedSearchCV`` as ``param_distributions``.
    estimator : estimator object
        The unfitted model to tune.
    n_iter : int
        Number of parameter settings that are sampled.
    features, target : array-like, optional
        Data used for the 10-fold search. When omitted, the notebook-level
        ``x_scaled`` and ``y`` are used, as before.
    random_state : int or None, default 42
        Seed for the parameter sampling so that the search is repeatable;
        pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(best_param, best_score)`` from the fitted search.
    """
    # Fall back to the notebook globals so existing three-argument calls work.
    if features is None:
        features = x_scaled
    if target is None:
        target = y

    rdm_search = RandomizedSearchCV(
        estimator,
        param_distributions=param_grid,
        n_jobs=-1,
        n_iter=n_iter,
        cv=10,
        random_state=random_state,
    )

    rdm_search.fit(features, target)
    best_param = rdm_search.best_params_
    best_score = rdm_search.best_score_

    return best_param , best_score
    
               
# Tune a fresh logistic regression over 40 sampled parameter settings
estimator = LogisticRegression()
fine_tuning(param_grid=param_grid, estimator=estimator, n_iter=40)

# The hyper-tuned model performs similarly to the model with default parameters

In [None]:
#Performance of the fine tuned model

# Hyper-parameters for the tuned model (presumably taken from a fine_tuning
# run — confirm). random_state is fixed because the 'saga' solver is
# stochastic; without a seed the fitted and pickled model differs between runs.
tuned_params = {
    "warm_start": True,
    "tol": 0.0003,
    "solver": 'saga',
    "penalty": 'l1',
    "n_jobs": -1,
    "multi_class": 'auto',
    "max_iter": 100,
    "intercept_scaling": 1,
    "dual": False,
    "C": 0.5,
    "random_state": 42,
}

tuned_model = LogisticRegression(**tuned_params)

tuned_model.fit(x_train,y_train)
tuned_performance = performance(x_train,x_test,y_train,y_test,tuned_model)


In [None]:
#Saving the model as a pickle file
file = "Logistic_model.sav"

# The context manager flushes and closes the file handle once the dump finishes.
with open(file, "wb") as model_file:
    pickle.dump(tuned_model, model_file)

In [None]:
# Predicting the data using the saved model
# pickle.load can execute arbitrary code, so only load files written by this
# notebook. The context manager closes the file handle after loading.
with open(file, "rb") as model_file:
    saved_model = pickle.load(model_file)

# Keep the raw encoded predictions under their own name, then map them back to
# the original label names for display.
predicted_codes = saved_model.predict(x_test)

model_prediction = pd.DataFrame(encoder.inverse_transform(predicted_codes), columns=["Predictions"])
model_prediction

In [None]:
# Cross validation of the tuned model with 5 stratified folds

skfold = StratifiedKFold(n_splits=5)
scores = cross_val_score(estimator=tuned_model, X=x_scaled, y=y, cv=skfold)
print(scores)

# The accuracy ranges from a minimum of 57% to a maximum of 65%

#### The final model is saved in the current working directory as "Logistic_model.sav". The model predicts the data with a minimum of 57% accuracy and a maximum of 65% accuracy.

In [None]:
# Decision tree fitted on the same split for comparison.
# random_state is fixed so that the fitted tree and its scores are repeatable.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(x_train,y_train)

dc_performance = performance(x_train,x_test,y_train,y_test,dc)
dc_performance


In [None]:
# Cross validation of the decision tree with 5 stratified folds
skfold = StratifiedKFold(n_splits=5)
scores = cross_val_score(estimator=dc, X=x_scaled, y=y, cv=skfold)
print(scores)
