In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_curve,roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the raw data and keep the target column aside.
ms_df = pd.read_csv('Microsoft_Data.csv')
y = ms_df['HasDetections']

# Remove the target and the 'MachineIdentifier' column from the feature frame.
ms_df = ms_df.drop(columns=['HasDetections', 'MachineIdentifier'])

# Per-column missing-value summary: absolute count and percentage of rows.
mainfo_df = pd.DataFrame({'col_name': ms_df.columns, 'na_cnt': ms_df.isnull().sum()})
mainfo_df['pc_cnt'] = (mainfo_df['na_cnt'] / ms_df.shape[0]) * 100

# Columns with more than 50% missing values are dropped entirely.
col_gt50_ls = mainfo_df.loc[mainfo_df['pc_cnt'] > 50, 'col_name'].tolist()
ms_df = ms_df.drop(columns=col_gt50_ls)

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    ms_df, y, test_size=0.2, random_state=99
)

# Continuous columns are int64/float64; categorical columns are object dtype.
con_cols = [col for col, dt in ms_df.dtypes.items() if dt == 'int64' or dt == 'float64']
cat_cols = [col for col, dt in ms_df.dtypes.items() if dt == 'object']

In [None]:
# Impute missing values with statistics computed on the training split only,
# and apply the same fill value to the test split.
#
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on x_train[col]: that form is chained assignment
# and does not update the parent frame when pandas Copy-on-Write is active.
for col in con_cols:
    train_mean = x_train[col].mean()
    x_train[col] = x_train[col].fillna(train_mean)
    x_test[col] = x_test[col].fillna(train_mean)

for col in cat_cols:
    # mode() can return several values; the first one is used as the fill value.
    train_mode = x_train[col].mode()[0]
    x_train[col] = x_train[col].fillna(train_mode)
    x_test[col] = x_test[col].fillna(train_mode)

In [None]:
# Standardise the continuous columns. The scaler is fitted once on all
# continuous training columns (StandardScaler works column-wise), so after this
# cell `scaler` holds the statistics of every continuous column and can be
# reused on new data. Refitting it inside a per-column loop would leave it
# holding only the last column's mean/variance.
scaler = StandardScaler()

if con_cols:  # StandardScaler cannot be fitted on zero features
    x_train[con_cols] = scaler.fit_transform(x_train[con_cols])
    # The test split is transformed with the training statistics (no refit).
    x_test[con_cols] = scaler.transform(x_test[con_cols])

In [None]:
# One-hot encode the categorical columns of each split independently.
cat_encd_train, cat_encd_test = (
    pd.get_dummies(split[cat_cols]) for split in (x_train, x_test)
)

In [None]:
# Encoding train and test separately can produce a different number of dummy
# columns in each frame. Aligning on the column axis with an inner join keeps
# only the dummy columns present in both, so the two frames end up identical
# in columns.
(cat_encd_train_final,
 cat_encd_test_final) = cat_encd_train.align(cat_encd_test, axis=1, join='inner')
cat_encd_test_final

In [None]:
# Training matrix: continuous columns followed by the aligned dummy columns.
x_train_final = pd.concat(
    objs=[x_train[con_cols], cat_encd_train_final],
    axis='columns',
)

In [None]:
# Test matrix: continuous columns followed by the aligned dummy columns.
x_test_final = pd.concat(
    objs=[x_test[con_cols], cat_encd_test_final],
    axis='columns',
)

In [None]:
# Fit a logistic regression with default settings on the training matrix.
logreg = LogisticRegression()
logreg.fit(X=x_train_final, y=y_train)

In [None]:
# Hard class predictions of the logistic regression on the test matrix.
y_test_pred = logreg.predict(X=x_test_final)

In [None]:
# Recall of the logistic regression on the test split.
recall_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# F1 score of the logistic regression on the test split.
f1_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# ROC curve of the logistic regression on the test split.
# Column 1 of predict_proba is the probability of the second class in
# logreg.classes_ (the positive class for a 0/1 target).
pos_probs = logreg.predict_proba(x_test_final)[:, 1]
# `thresold` keeps its original spelling so any later reference still resolves.
fpr, tpr, thresold = roc_curve(y_test, pos_probs)

plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# The plotted curve is the ROC curve; AUC is the scalar area under it.
plt.title('ROC Curve');


In [None]:
# Area under the ROC curve for the logistic regression on the test split.
roc_auc_score(y_true=y_test, y_score=pos_probs)

In [None]:
# Decision tree with default hyperparameters. max_depth, min_samples_split,
# etc. are hyperparameters that can be chosen via hyperparameter tuning.
# random_state is fixed (same seed as the train/test split) so the fitted tree,
# and every metric derived from it, is reproducible across runs.
dtree = DecisionTreeClassifier(random_state=99)
dtree.fit(x_train_final, y_train)

In [None]:
# Hard class predictions of the decision tree on the test matrix.
dtree_test_pred = dtree.predict(X=x_test_final)

In [None]:
# Confusion matrix of the decision tree (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=dtree_test_pred)

In [None]:
# Precision of the decision tree on the test split.
precision_score(y_true=y_test, y_pred=dtree_test_pred)

In [None]:
# Recall of the decision tree on the test split.
recall_score(y_true=y_test, y_pred=dtree_test_pred)

In [None]:
# F1 score of the decision tree on the test split.
f1_score(y_true=y_test, y_pred=dtree_test_pred)

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# dtree.classes_; keep it for the ROC computation and display it.
dtree_pos_probs = dtree.predict_proba(X=x_test_final)[:, 1]
dtree_pos_probs

In [None]:
# ROC points for the decision tree; the false-positive rates are displayed.
(dtree_fpr,
 dtree_tpr,
 dtree_thresold) = roc_curve(y_test, dtree_pos_probs)
dtree_fpr

In [None]:
# ROC curve of the decision tree, labelled so the figure can be read on its own.
plt.plot(dtree_fpr, dtree_tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Decision Tree)');

In [None]:
logreg.score(x_train_final,y_train)  # train score: mean accuracy (a classifier's .score is accuracy, not R^2)



In [None]:
logreg.score(x_test_final,y_test)      # test score: mean accuracy (a classifier's .score is accuracy, not R^2)

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression fitted on the 0/1 target.
# Lasso can drive coefficients exactly to zero, so the number of zeroed
# coefficients is reported alongside the scores.
ls_reg = Lasso(alpha=1)
ls_reg.fit(x_train_final, y_train)

# Only the last expression of a cell is displayed, so all results are gathered
# into one dict. The zero count sums the boolean mask; len() of the mask would
# only give the total number of coefficients.
{
    'n_coefficients': ls_reg.coef_.size,
    'n_zero_coefficients': int((ls_reg.coef_ == 0).sum()),
    'train_r2': ls_reg.score(x_train_final, y_train),  # regressor .score is R^2
    'test_r2': ls_reg.score(x_test_final, y_test),
}

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression fitted on the 0/1 target.
lr_rid = Ridge(alpha=2)
lr_rid.fit(x_train_final, y_train)

# Only the last expression of a cell is displayed, so all results are gathered
# into one dict. The zero count sums the boolean mask; len() of the mask would
# only give the total number of coefficients.
{
    'n_coefficients': lr_rid.coef_.size,
    'n_zero_coefficients': int((lr_rid.coef_ == 0).sum()),
    'train_r2': lr_rid.score(x_train_final, y_train),  # regressor .score is R^2
    'test_r2': lr_rid.score(x_test_final, y_test),
}

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear regression trained by stochastic gradient descent with an L1 penalty.
# SGD shuffles the data, so random_state is fixed for reproducible coefficients.
sgdrg = SGDRegressor(verbose=1, penalty='l1', random_state=99)
sgdrg.fit(x_train_final, y_train)

# Test-set predictions live in their own variable rather than as an ad-hoc
# attribute on the estimator.
sgdrg_pred = sgdrg.predict(x_test_final)
# Backward-compatible alias for code that still reads `sgdrg.pred`.
sgdrg.pred = sgdrg_pred

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings; random_state is fixed for reproducibility.
ada = AdaBoostClassifier(random_state=99)
ada.fit(x_train_final, y_train)

# Display the class probabilities on the test matrix. The earlier predict /
# predict_log_proba calls were removed: their results were neither stored nor
# displayed (a cell only shows its last expression).
ada.predict_proba(x_test_final)


In [None]:
# Mean accuracy of AdaBoost on both splits. Both values go into one dict
# because a cell only displays its last expression, so a bare train score
# on an earlier line would be computed and silently discarded.
{
    'train_accuracy': ada.score(x_train_final, y_train),
    'test_accuracy': ada.score(x_test_final, y_test),
}

In [None]:
# Baseline decision tree before tuning. DecisionTreeClassifier is already
# imported in the notebook's import cell, so it is not re-imported here.
# random_state is fixed so the baseline is reproducible.
dtc = DecisionTreeClassifier(random_state=99)
dtc.fit(x_train_final, y_train)
dtc.predict(x_test_final)

# Hyperparameter Tuning for DecisionTreeClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the searches; seeded so that the cross-validation scores,
# and therefore best_params_, are reproducible.
dtc = DecisionTreeClassifier(random_state=99)

# Search space: 4 values for each of 4 hyperparameters = 256 combinations.
hyp_dict = {
    'max_depth': [5, 6, 7, 8],
    'min_samples_split': [4, 5, 7, 8],
    'max_leaf_nodes': [5, 7, 8, 9],
    'min_samples_leaf': [2, 3, 4, 6],
}

# Exhaustive 5-fold search (256 x 5 = 1280 fits). verbose=1 prints only a
# summary line; verbose=21 logs every single fit.
gcv = GridSearchCV(estimator=dtc, param_grid=hyp_dict, cv=5, verbose=1)
gcv.fit(x_train_final, y_train)

In [None]:
# Predict once with the refitted best estimator and keep the result.
dtc_hp_pred = gcv.predict(x_test_final)

# Last expression of the cell, so the winning hyperparameters are displayed.
gcv.best_params_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised 5-fold search: 120 parameter settings sampled from hyp_dict.
# random_state fixes which settings are sampled, so the search is reproducible.
# verbose=1 prints only a summary line instead of logging every fit.
rcv = RandomizedSearchCV(
    estimator=dtc,
    param_distributions=hyp_dict,
    cv=5,
    n_iter=120,
    random_state=99,
    verbose=1,
)
rcv.fit(x_train_final, y_train)

In [None]:
# Predict once with the refitted best estimator and keep the result.
# NOTE: this assigns to `dtc_hp_pred`, the same name the grid-search cell uses,
# so any earlier value under that name is replaced.
dtc_hp_pred = rcv.predict(x_test_final)

# Last expression of the cell, so the winning hyperparameters are displayed.
rcv.best_params_