In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [3]:
data=pd.read_csv('../data/part_2/daily_data.csv')
data.head(10)

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM
5,D0006,C001,20.0,,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,06:07 AM,07:13 PM
6,D0007,C001,21.0,Partly Cloudy,4.0,310,1015.0,0.0,100,50,21.0,10.0,1.0,15.1,2,06:08 AM,07:11 PM
7,D0008,C001,21.0,,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,06:09 AM,07:10 PM
8,D0009,C001,23.0,,3.6,63,1013.0,0.0,47,75,24.8,10.0,1.0,20.5,1,06:10 AM,07:08 PM
9,D0010,C001,24.0,,15.1,330,1014.0,0.0,54,75,25.3,10.0,1.0,9.7,1,06:11 AM,07:06 PM


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_id                    2893 non-null   object 
 1   city_id                   2893 non-null   object 
 2   temperature_celsius       2893 non-null   float64
 3   condition_text            479 non-null    object 
 4   wind_kph                  2893 non-null   float64
 5   wind_degree               2893 non-null   int64  
 6   pressure_mb               2893 non-null   float64
 7   precip_mm                 2893 non-null   float64
 8   humidity                  2893 non-null   int64  
 9   cloud                     2893 non-null   int64  
 10  feels_like_celsius        2893 non-null   float64
 11  visibility_km             2893 non-null   float64
 12  uv_index                  2893 non-null   float64
 13  gust_kph                  2893 non-null   float64
 14  air_qual

In [5]:
train_data = data.dropna(subset=['condition_text'])

X = train_data.drop(columns=['day_id', 'city_id', 'condition_text', 'sunrise', 'sunset'])
y = train_data['condition_text']


In [6]:
# Weather conditions
conditions = ["Clear and Sunny", "Partly Cloudy", "Light Precipitation", "Cloudy and Overcast","Mist or Fog", "Rain Showers", "Light Rain with Thunder", "Thunderstorms", "Moderate to Heavy Rain"]

## Label Encoding

In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(conditions)
y_encoded = label_encoder.transform(y)

## Standard Scaler

In [8]:
X.fillna(X.mean(), inplace=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [29]:
new_svm = SVC()

# Define the hyperparameters
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}
grid_search = GridSearchCV(estimator=new_svm, param_grid=param_grid, cv=9, n_jobs=-1, verbose=2)
grid_search.fit(X_scaled, y_encoded)


best_rf_model = grid_search.best_estimator_
print(best_rf_model)


Fitting 9 folds for each of 80 candidates, totalling 720 fits
SVC(C=1, gamma=1, kernel='linear')


## Cross-validation

In [30]:
cv_scores = cross_val_score(best_rf_model, X_scaled, y_encoded, cv=9)

print(cv_scores)

print("Mean CV score:", np.mean(cv_scores))

[0.68518519 0.62962963 0.62264151 0.67924528 0.8490566  0.64150943
 0.69811321 0.52830189 0.69811321]
Mean CV score: 0.6701995496544763


In [22]:
#Predicting the missing values
full_X = data.drop(columns=['day_id', 'city_id', 'condition_text', 'sunrise', 'sunset'])
full_X_scaled = scaler.transform(full_X.fillna(full_X.mean()))
predictions = best_rf_model.predict(full_X_scaled)


data['condition_text'] = label_encoder.inverse_transform(predictions)

In [23]:
#Save into csv
submission = data[['day_id', 'condition_text']]
submission.to_csv('../data/part_2/submission_new_svm.csv', index=False)

In [24]:
submission.head(20)

Unnamed: 0,day_id,condition_text
0,D0001,Light Precipitation
1,D0002,Partly Cloudy
2,D0003,Light Precipitation
3,D0004,Clear and Sunny
4,D0005,Clear and Sunny
5,D0006,Clear and Sunny
6,D0007,Partly Cloudy
7,D0008,Partly Cloudy
8,D0009,Partly Cloudy
9,D0010,Partly Cloudy
