In [239]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

### prepare data

In [240]:
df = pd.read_csv('00Term project materials-20221204/npf_train.csv')
df.head()
df.shape

(464, 104)

In [241]:
npf = df.copy()
class2 = np.array(["noevent","event"])
npf["class2"] = class2[(npf["class4"]!="nonevent").astype(int)]
class_type = np.array([0,1])
npf["class_type"] = class_type[(npf["class4"]!="nonevent").astype(int)]
npf.head()


Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std,class2,class_type
0,1,2000-01-17,Ib,False,368.771711,0.310309,368.665658,0.305127,369.371184,0.333606,...,-0.899393,0.271648,2.492491,1.31088,0.031587,0.018122,0.000243,3.5e-05,event,1
1,2,2000-02-28,nonevent,False,378.197295,1.001493,378.083089,1.025472,378.671311,1.017208,...,2.033191,0.27109,0.295937,0.177836,0.00514,0.003552,0.003658,0.00094,noevent,0
2,3,2000-03-24,Ib,False,373.043158,0.749865,372.930066,0.728411,373.569735,0.83524,...,0.780422,2.334741,14.434789,8.627312,0.353743,0.272472,0.000591,0.000191,event,1
3,4,2000-03-30,II,False,375.643019,0.558629,375.54943,0.540964,376.045849,0.58073,...,6.533544,2.695277,16.077513,9.984686,0.568242,0.45183,0.002493,0.000466,event,1
4,5,2000-04-04,nonevent,False,377.66103,0.408421,377.609576,0.423963,378.117134,0.417243,...,2.133774,1.614122,9.710422,7.054069,0.339135,0.291457,0.004715,0.000679,noevent,0


In [242]:
npf["class2"].value_counts()

event      232
noevent    232
Name: class2, dtype: int64

In [243]:
npf["class_type"].value_counts()

1    232
0    232
Name: class_type, dtype: int64

In [244]:
npf["multiclass"] = np.where(npf["class4"]=="nonevent",0,1)
npf["multiclass"] = np.where(npf["class4"]=="Ia",1,npf["multiclass"])
npf["multiclass"] = np.where(npf["class4"]=="Ib",2,npf["multiclass"])
npf["multiclass"] = np.where(npf["class4"]=="II",3,npf["multiclass"])
npf["multiclass"].value_counts()

0    232
3    113
2     85
1     34
Name: multiclass, dtype: int64

In [245]:
npf.head()

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std,class2,class_type,multiclass
0,1,2000-01-17,Ib,False,368.771711,0.310309,368.665658,0.305127,369.371184,0.333606,...,0.271648,2.492491,1.31088,0.031587,0.018122,0.000243,3.5e-05,event,1,2
1,2,2000-02-28,nonevent,False,378.197295,1.001493,378.083089,1.025472,378.671311,1.017208,...,0.27109,0.295937,0.177836,0.00514,0.003552,0.003658,0.00094,noevent,0,0
2,3,2000-03-24,Ib,False,373.043158,0.749865,372.930066,0.728411,373.569735,0.83524,...,2.334741,14.434789,8.627312,0.353743,0.272472,0.000591,0.000191,event,1,2
3,4,2000-03-30,II,False,375.643019,0.558629,375.54943,0.540964,376.045849,0.58073,...,2.695277,16.077513,9.984686,0.568242,0.45183,0.002493,0.000466,event,1,3
4,5,2000-04-04,nonevent,False,377.66103,0.408421,377.609576,0.423963,378.117134,0.417243,...,1.614122,9.710422,7.054069,0.339135,0.291457,0.004715,0.000679,noevent,0,0


# Binary classification

In [246]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
X = npf.drop(["id","date","class4","partlybad","class2","class_type","multiclass"],axis=1)
y = npf["class_type"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)
X_train.shape

(324, 100)

In [247]:
X_train.mean(axis=0)
X_train.std(axis=0)

CO2168.mean    11.342924
CO2168.std      3.429428
CO2336.mean    11.324012
CO2336.std      3.143118
CO242.mean     10.899764
                 ...    
UV_A.std        4.941659
UV_B.mean       0.308604
UV_B.std        0.281495
CS.mean         0.001977
CS.std          0.000651
Length: 100, dtype: float64

In [248]:
#without preprocessing data
m1 = LogisticRegression()
m1.fit(X_train,y_train)
predictions = m1.predict(X_test)
#probability of "event" for test data
pre_prob = m1.predict_proba(X_test)[:,1]
print(pre_prob.shape)
# print("probability:",pre_prob)
print("confusion_matrix:",confusion_matrix(y_test,predictions))
#score shows the accuracy of your model (Binary accuracy (class2))
score = m1.score(X_test,y_test)
print("score:",score)
#perplexity  P = exp(−mean(ln(pi)))
perplexity_test = np.exp(-np.mean(np.log(y_test*pre_prob + (1 - y_test) * (1 - pre_prob))))
print("perplexity_test:",perplexity_test)
print("classification_report:",classification_report(y_test,predictions))

(140,)
confusion_matrix: [[61 11]
 [ 7 61]]
score: 0.8714285714285714
perplexity_test: 1.4464358121680827
classification_report:               precision    recall  f1-score   support

           0       0.90      0.85      0.87        72
           1       0.85      0.90      0.87        68

    accuracy                           0.87       140
   macro avg       0.87      0.87      0.87       140
weighted avg       0.87      0.87      0.87       140



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [249]:
#with preprocessing data
import sklearn.preprocessing as preprocessing
#l1,l2,max  3 types normalization
X_trainNorm = preprocessing.normalize(X_train, norm="max")
X_testNorm = preprocessing.normalize(X_test, norm="max")
m2 = LogisticRegression()
m2.fit(X_trainNorm,y_train)
predictions_Norm = m2.predict(X_testNorm)
#probability of "event" for test data
pre_prob_Norm = m2.predict_proba(X_testNorm)[:,1]
# print(pre_prob_Norm)
print("confusion_matrix",confusion_matrix(y_test,predictions_Norm))
#score shows the accuracy of your model (Binary accuracy (class2))
score_Norm = m2.score(X_testNorm,y_test)
print("score:",score_Norm)
#perplexity  P = exp(−mean(ln(pi)))
perplexity_test_Norm = np.exp(-np.mean(np.log(y_test*pre_prob_Norm + (1 - y_test) * (1 - pre_prob_Norm))))
print("perplexity_test:",perplexity_test_Norm)
print("classification_report:",classification_report(y_test,predictions_Norm))


confusion_matrix [[58 14]
 [ 8 60]]
score: 0.8428571428571429
perplexity_test: 1.6213929530741549
classification_report:               precision    recall  f1-score   support

           0       0.88      0.81      0.84        72
           1       0.81      0.88      0.85        68

    accuracy                           0.84       140
   macro avg       0.84      0.84      0.84       140
weighted avg       0.85      0.84      0.84       140



# best answer so far:0.89

In [250]:
#with preprocessing data
import sklearn.preprocessing as preprocessing
#standardization
#The result is that all data for each attribute is clustered around 0 with a variance of 1
# scaler = preprocessing.StandardScaler().fit(X)
# StandardScaler(copy=True, with_mean=True, with_std=True)
# X_trainScale = scaler.transform(X_train)
# X_testScale = scaler.transform(X_test)
X_trainScale = preprocessing.scale(X_train)
X_testScale = preprocessing.scale(X_test)
m3 = LogisticRegression()
m3.fit(X_trainScale,y_train)
predictions_Scale = m3.predict(X_testScale)
#probability of "event" for test data   
pre_prob_Scale = m3.predict_proba(X_testScale)[:,1]
# print(pre_prob_Scale)
print("confusion_matrix",confusion_matrix(y_test,predictions_Scale))
#score shows the accuracy of your model(Binary accuracy (class2))
score_Scale = m3.score(X_testScale,y_test)
print("score:",score_Scale)
#perplexity  P = exp(−mean(ln(pi)))
perplexity_test_Scale = np.exp(-np.mean(np.log(y_test*pre_prob_Scale + (1 - y_test) * (1 - pre_prob_Scale))))
print("perplexity_test:",perplexity_test_Scale)
print("classification_report:",classification_report(y_test,predictions_Scale))


confusion_matrix [[62 10]
 [ 5 63]]
score: 0.8928571428571429
perplexity_test: 1.335464904515172
classification_report:               precision    recall  f1-score   support

           0       0.93      0.86      0.89        72
           1       0.86      0.93      0.89        68

    accuracy                           0.89       140
   macro avg       0.89      0.89      0.89       140
weighted avg       0.90      0.89      0.89       140



### estimate your accuracy in the test set(use m3)

In [251]:

npf_test = pd.read_csv('00Term project materials-20221204/npf_test_hidden.csv')
npf_test.head()

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,465,,,False,372.893878,4.629224,373.054467,4.205271,373.651472,5.784621,...,12.41528,1.609407,12.804559,2.312871,15.330203,12.127582,0.564091,0.535718,0.003237,0.00128
1,466,,,False,384.357287,6.189277,384.481755,6.087435,384.976738,6.291519,...,22.072418,3.83535,23.068204,4.320448,21.137412,15.249912,0.986913,0.868741,0.008954,0.001712
2,467,,,False,407.8518,2.338562,406.5674,1.789194,411.091224,2.187063,...,-14.321208,0.795222,-16.395076,1.230319,1.719967,1.056718,0.04333,0.028101,0.002392,0.000673
3,468,,,False,380.007321,0.311889,379.976,0.301242,380.187636,0.332844,...,-5.78429,0.134812,-5.341722,0.144034,0.66826,0.300168,0.013184,0.005966,0.002179,0.000264
4,469,,,False,403.974082,0.369724,404.066122,0.370966,404.077755,0.439589,...,-17.074846,0.35116,-16.364321,0.461174,1.756448,0.663685,0.042135,0.016875,0.001908,9.1e-05


In [252]:
npf_test.shape

(965, 104)

In [253]:
x_t = npf_test.drop(["class4","partlybad","id","date"],axis=1)
x_testdata = preprocessing.scale(x_t)
x_testdata.shape

(965, 100)

In [254]:
y_pred = m3.predict(x_testdata)
y_pred.shape

(965,)

# Multiclass Classification

In [255]:
new_y = npf["multiclass"]
X_train, X_test, new_y_train, new_y_test = train_test_split(X,new_y,test_size=0.3,random_state=101)

(324, 100)

### Need to train 3 classifiers