In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("hacktrain.csv")

In [3]:
df.shape

(8000, 30)

In [4]:
df.isnull().sum()

Unnamed: 0       0
ID               0
class            0
20150720_N     560
20150602_N    1200
20150517_N     800
20150501_N     960
20150415_N     480
20150330_N    1120
20150314_N     720
20150226_N    1360
20150210_N     640
20150125_N    1040
20150109_N     880
20141117_N    1280
20141101_N     400
20141016_N    1440
20140930_N     800
20140813_N     560
20140626_N    1600
20140610_N     480
20140525_N     720
20140509_N     880
20140423_N    1760
20140407_N     640
20140322_N    1120
20140218_N    1440
20140202_N     560
20140117_N    1200
20140101_N     400
dtype: int64

In [5]:
df = df.drop(columns=['Unnamed: 0', 'ID'])
df.head()

Unnamed: 0,class,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,20150210_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,water,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,,-1325.64,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,water,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,-1670.59,-1408.64,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,water,58.0174,-1599.16,,-1052.63,,-1564.63,,729.79,-3162.14,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,water,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,683.254,-2829.4,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,water,1136.44,,,1647.83,1935.8,,2158.98,,1242.87,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [6]:
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

In [7]:
new_df = df.drop(columns=['class'])

In [8]:
new_df.head()

Unnamed: 0,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,20150210_N,20150125_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,637.595,658.668,-1882.03,-1924.36,997.904,-1739.99,630.087,,-1325.64,-944.084,...,,-1043.16,-1942.49,267.138,,,211.328,-2203.02,-1180.19,433.906
1,634.24,593.705,-1625.79,-1672.32,914.198,-692.386,707.626,-1670.59,-1408.64,-989.285,...,,-933.934,-625.385,120.059,364.858,476.972,220.878,-2250.0,-1360.56,524.075
2,58.0174,-1599.16,,-1052.63,,-1564.63,,729.79,-3162.14,,...,-1025.88,368.622,,-1227.8,304.621,,369.214,-2202.12,,-1343.55
3,72.518,,380.436,-1256.93,515.805,-1413.18,-802.942,683.254,-2829.4,-1267.54,...,-1813.95,155.624,,-924.073,432.15,282.833,298.32,-2197.36,,-826.727
4,1136.44,,,1647.83,1935.8,,2158.98,,1242.87,-2646.05,...,1535.0,1959.43,-279.317,-384.915,-113.406,1020.72,1660.65,-116.801,-568.05,-1357.14


In [9]:
new_df = new_df.fillna(new_df.mean())

In [10]:
new_df.isnull().sum()

20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64

In [11]:
print(df['class'].isnull().sum())

0


In [12]:
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

remove_outlier = remove_outliers(new_df,new_df)

In [13]:
scaler = StandardScaler()
standardized_df = scaler.fit_transform(remove_outlier)
print(standardized_df)

[[-1.74955562 -1.42448064 -1.63362804 ... -3.05718798  0.09445484
  -0.50801061]
 [-1.42845889 -1.13215424 -1.50258682 ... -0.06044144  0.66831239
  -0.33322868]
 [-1.66965744 -1.16983364 -0.02592791 ... -2.79354926  1.04777536
  -0.39407617]
 ...
 [-0.23904833  1.28273895 -1.63674219 ... -0.99732906 -0.54007701
   0.65246844]
 [ 0.28025415  1.28265893 -1.56441219 ...  0.66507319 -0.51146223
   2.04911258]
 [-1.17107795 -1.16686039 -1.56401709 ... -2.04265238  0.7301014
  -0.93919687]]


In [14]:
standardized_df = pd.DataFrame(standardized_df, columns=remove_outlier.columns)

In [15]:
X = standardized_df
y = df.loc[remove_outlier.index, 'class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [16]:
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),  
    ('logreg', LogisticRegression(max_iter=1000))
])

In [17]:
poly_model.fit(X_train, y_train)

In [18]:
y_pred = poly_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

Accuracy: 0.9403606102635229


In [19]:
y_pred = poly_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9403606102635229
              precision    recall  f1-score   support

           0       0.82      0.91      0.86       115
           1       0.99      0.97      0.98       531
           2       0.95      0.78      0.86        23
           3       0.75      0.80      0.78        50
           4       0.50      0.50      0.50         2

    accuracy                           0.94       721
   macro avg       0.80      0.79      0.80       721
weighted avg       0.94      0.94      0.94       721



In [20]:
y_pred = poly_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9403606102635229

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86       115
           1       0.99      0.97      0.98       531
           2       0.95      0.78      0.86        23
           3       0.75      0.80      0.78        50
           4       0.50      0.50      0.50         2

    accuracy                           0.94       721
   macro avg       0.80      0.79      0.80       721
weighted avg       0.94      0.94      0.94       721


Confusion Matrix:
 [[105   4   0   6   0]
 [ 12 514   0   4   1]
 [  1   1  18   3   0]
 [  9   0   1  40   0]
 [  1   0   0   0   1]]


In [21]:
test_data = pd.read_csv('hacktest.csv')
test_data.shape

(2845, 29)

In [22]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,ID,20150720_N,20150602_N,20150517_N,20150501_N,20150415_N,20150330_N,20150314_N,20150226_N,...,20140610_N,20140525_N,20140509_N,20140423_N,20140407_N,20140322_N,20140218_N,20140202_N,20140117_N,20140101_N
0,0,1,7466.42,413.162,5761.0,5625.45,489.403,3923.84,3097.11,6766.42,...,801.184,927.115,4704.14,6378.42,340.949,2695.57,527.268,4736.75,601.843,6639.76
1,1,2,7235.26,6037.35,1027.56,6085.14,1618.05,6668.54,2513.99,1051.69,...,5533.47,5103.04,5216.12,4885.27,4366.79,1234.14,3298.11,6942.68,1070.44,842.101
2,2,3,7425.08,6969.98,1177.94,7408.93,861.061,7644.43,814.458,1504.29,...,1981.39,6204.54,7021.69,5704.41,4897.45,1789.99,2206.1,6928.93,1036.56,831.441
3,3,4,7119.12,1731.62,6311.93,6441.61,465.979,7128.42,1649.12,6935.22,...,959.344,5794.15,1045.57,5572.9,586.287,685.906,1287.0,6734.72,824.584,6883.61
4,4,5,7519.55,8130.26,1482.54,7879.53,1001.21,7937.6,4122.53,1094.51,...,7636.07,6996.76,7413.43,4596.13,4511.7,1413.52,3283.94,7937.68,1857.8,1336.92


In [23]:
test_ids = test_data['ID']

In [24]:
test_data.isnull().sum()

Unnamed: 0    0
ID            0
20150720_N    0
20150602_N    0
20150517_N    0
20150501_N    0
20150415_N    0
20150330_N    0
20150314_N    0
20150226_N    0
20150210_N    0
20150125_N    0
20150109_N    0
20141117_N    0
20141101_N    0
20141016_N    0
20140930_N    0
20140813_N    0
20140626_N    0
20140610_N    0
20140525_N    0
20140509_N    0
20140423_N    0
20140407_N    0
20140322_N    0
20140218_N    0
20140202_N    0
20140117_N    0
20140101_N    0
dtype: int64

In [25]:
test_data = test_data.drop(columns=['ID', 'Unnamed: 0'])

In [26]:
scaler = StandardScaler()
standardized_df = scaler.fit_transform(test_data)
print(standardized_df)

[[ 0.91827497 -1.532337    0.63141698 ... -0.34225952 -0.79964212
   1.69934801]
 [ 0.8209536   0.56409878 -1.0380865  ...  0.57157763 -0.5898897
  -0.6978452 ]
 [ 0.90087029  0.91173988 -0.98504687 ...  0.5658815  -0.605055
  -0.70225286]
 ...
 [-3.2207124  -1.74434205 -1.90208799 ... -4.06899823 -0.88456777
  -1.53011091]
 [-3.49017336 -2.14003033 -1.33698132 ... -4.61384225 -0.9353169
  -0.85566157]
 [-3.47774084 -2.10747777 -1.23694528 ... -4.56364175 -0.93139755
  -0.87356302]]


In [27]:
standardized_df = pd.DataFrame(standardized_df, columns=test_data.columns)

In [28]:
X_test = standardized_df

In [29]:
poly_model.fit(X_train, y_train)
test_preds = poly_model.predict(X_test)


In [30]:
test_preds = label_encoder.inverse_transform(test_preds)

In [31]:
submission = pd.DataFrame({
    'ID': test_ids,
    'class': test_preds
    })
submission.to_csv('submission.csv', index=False)

In [32]:
df = pd.read_csv('submission.csv')
df.head()

Unnamed: 0,ID,class
0,1,forest
1,2,orchard
2,3,forest
3,4,farm
4,5,orchard
