In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
)
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Read the train and test tables, parsing the "Datetime" column into
# datetimes at load time; the sample submission is read as-is.
train_data, test_data = (
    pd.read_csv(csv_name, parse_dates=["Datetime"])
    for csv_name in ("train.csv", "test.csv")
)
samplesubmission = pd.read_csv("SampleSubmission.csv")

# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,ID,Datetime,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity,Offset_fault
0,ID_QF0ZTQJ2SF5Q,2021-11-03 04:06:31,52.58,49.52,17.4,96.0,0
1,ID_4GTK689CNX5S,2021-11-08 18:43:23,35.25,33.4,25.0,75.0,0
2,ID_DL7VVKW9U7XQ,2021-11-07 09:50:33,19.18,23.5,24.9,75.0,0
3,ID_6XQOMBXM2DG3,2022-01-01 18:55:15,19.4,15.48,24.9,70.0,0
4,ID_UQZW9ISJY9QE,2021-11-05 22:23:48,38.3,34.77,20.9,89.0,0


In [3]:
# Preview the first rows of the test table.
test_data.head()

Unnamed: 0,ID,Datetime,Sensor1_PM2.5,Sensor2_PM2.5,Temperature,Relative_Humidity
0,ID_STL3V9B2OU12,2022-01-28 08:49:41,90.75,114.78,22.2,56.0
1,ID_U2EKAY4SGA94,2022-02-16 21:30:17,56.03,52.18,,
2,ID_UJFLEWQR0U4N,2022-01-24 17:57:18,387.35,346.73,28.1,63.0
3,ID_S2C2GPGA4R5I,2022-01-31 07:07:09,63.17,83.85,21.6,82.0
4,ID_5V2C2ZL221H8,2022-01-31 00:37:05,69.23,53.07,22.7,80.0


In [None]:
# Continuous measurement columns used for the pairwise density overview.
num_cols = [
    "Sensor1_PM2.5",
    "Sensor2_PM2.5",
    "Temperature",
    "Relative_Humidity",
]

# KDE pair grid of the measurements, with extra contour lines drawn on the
# lower triangle.
measurements = train_data[num_cols]
g = sns.pairplot(data=measurements, kind="kde")
g.map_lower(sns.kdeplot, levels=4, color=".2")
plt.show()

In [None]:
# Preview the first rows of the sample submission table.
samplesubmission.head()

In [None]:
# Print the (rows, columns) shape of each loaded table.
for label, frame in (
    ("train_data", train_data),
    ("test_data", test_data),
    ("samplesubmission", samplesubmission),
):
    print(f"{label} shape: {frame.shape}")

In [None]:
# Summary statistics for the training table; include="all" asks for every
# column to be summarised, not only the numeric ones.
train_data.describe(include = "all")

In [None]:
# One flag per table (train, test): True when any column holds a missing value.
tuple(frame.isna().sum().any() for frame in (train_data, test_data))

In [None]:
# fillna() returns a new frame and the result is not assigned here, so
# train_data itself keeps its missing values (the feature matrix is imputed
# separately when it is built). Preview only the first rows of the
# zero-filled copy instead of rendering the whole table.
train_data.fillna(0).head()

In [None]:
# One flag per table (train, test): True when any row is a full duplicate
# of an earlier row.
tuple(frame.duplicated().any() for frame in (train_data, test_data))

In [None]:
# List the column labels of the training table.
train_data.columns

In [None]:
# Break the training timestamps into calendar components, one new
# "Datetime_<part>" column per component.
for part in ("day", "month", "year", "hour"):
    train_data[f"Datetime_{part}"] = getattr(train_data["Datetime"].dt, part)

# Show the original timestamp next to the derived columns.
train_data[
    ["Datetime", "Datetime_day", "Datetime_month", "Datetime_year", "Datetime_hour"]
].head()

In [None]:
# Break the test timestamps into calendar components, one new
# "Datetime_<part>" column per component.
for part in ("day", "month", "year", "hour"):
    test_data[f"Datetime_{part}"] = getattr(test_data["Datetime"].dt, part)

# Show the original timestamp next to the derived columns.
test_data[
    ["Datetime", "Datetime_day", "Datetime_month", "Datetime_year", "Datetime_hour"]
].head()

In [None]:
# Model inputs: the four measurement columns plus the derived date parts.
train_feats = [
    "Sensor1_PM2.5",
    "Sensor2_PM2.5",
    "Temperature",
    "Relative_Humidity",
    "Datetime_day",
    "Datetime_month",
    "Datetime_year",
    "Datetime_hour",
]

# Feature matrix with missing values replaced by 0, and the target column.
X = train_data.loc[:, train_feats].fillna(0)
y = train_data.loc[:, "Offset_fault"]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# fit() returns the estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(
    solver="liblinear", C=10.0, random_state=0
).fit(X_train, y_train)
logreg

In [None]:
# Score the held-out split. The class probabilities are computed on the same
# rows as the hard predictions so that p_pred, y_pred and y_test line up
# row-for-row (previously the probabilities were taken from X_train while
# every evaluation step used X_test).
p_pred = logreg.predict_proba(X_test)
y_pred = logreg.predict(X_test)

In [None]:
# Show the fitted parameters. Both lines use the same "label: value" layout;
# the coefficient label previously had no separator and ran straight into
# the array text.
print("intercept: " + str(logreg.intercept_))
print("coefficient: " + str(logreg.coef_))

In [None]:
# Print the shape of each split/prediction object, in the order
# y_test, X_test, y_pred, X_train, y_train.
for split_part in (y_test, X_test, y_pred, X_train, y_train):
    print(split_part.shape)

In [None]:
# Scatter the first sensor's reading against the fault label.
sns.scatterplot(data=train_data, x="Sensor1_PM2.5", y="Offset_fault")

In [None]:
# Tabulate held-out labels against the model's predictions and print the
# raw count matrix.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

In [None]:
# Heatmap of the confusion matrix. fmt="d" prints each cell as a whole
# number; the default annotation format (".2g") keeps only two significant
# digits and switches to scientific notation for large counts.
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(conf_matrix), annot=True, fmt="d", ax=ax)
ax.set_title("Confusion matrix plot of y_test and y_pred")
ax.set_ylabel("Actual label")
ax.set_xlabel("Predicted label")
# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()

In [None]:
# classification_report() returns an already-aligned text table whose header
# row begins with padding. Print the label on its own line so it does not
# push the header out of alignment with the rows beneath it.
# (classification_report is imported with the other sklearn.metrics names in
# the notebook's import cell.)
report = classification_report(y_test, y_pred)
print("Report:\n" + report)

# Headline metrics on the held-out split.
print("Accuracy: " + str(accuracy_score(y_test, y_pred)))
print("Precision: " + str(precision_score(y_test, y_pred)))