In [None]:
# Download the files of the 'tpsbsdaa2122' competition with the Kaggle CLI.
# NOTE(review): the data cells below read from '../input/trabalho/' — confirm that
# this download is what populates that folder.
!kaggle competitions download -c tpsbsdaa2122

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.tree import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

### **Algumas funções para tratamento de dados**

#### **Tratamento do campo 'record_date'**

In [None]:
def daypart(hour):
    """Return the name of the part of the day that ``hour`` falls in.

    Buckets (bounds as written): (0, 7) dawn, [7, 10] early morning,
    (10, 12) late morning, [12, 14] lunch, (14, 17) early afternoon,
    [17, 20] late afternoon, (20, 22) evening; anything else
    (including 0, 22 and 23) is "midnight".
    """
    # Everything outside the open interval (0, 22) is the catch-all bucket.
    if not (0 < hour < 22):
        return "midnight"

    # Inside (0, 22) the buckets are contiguous, so only the upper bound of
    # each one needs to be tested, in ascending order.
    if hour < 7:
        return "dawn"
    if hour <= 10:
        return "early morning"
    if hour < 12:
        return "late morning"
    if hour <= 14:
        return "lunch"
    if hour < 17:
        return "early afternoon"
    if hour <= 20:
        return "late afternoon"
    return "evening"
    
def is_weekend(day_name):
    """Return 1 when ``day_name`` is 'Saturday' or 'Sunday', otherwise 0."""
    return int(day_name in ('Saturday', 'Sunday'))

def is_friday(day_name):
    """Return 1 when ``day_name`` is 'Friday', otherwise 0."""
    return 1 if day_name == 'Friday' else 0
    
def season(month):
    """Return the season label for a month number.

    Months 4-5 are "spring", 6-9 "summer", 10-11 "fall"; every other value
    (12 and 1-3 included) is "winter".
    """
    # Anything outside the open interval (3, 12) is the catch-all bucket.
    if not (3 < month < 12):
        return "winter"
    if month < 6:
        return "spring"
    if month <= 9:
        return "summer"
    return "fall"

# Main handler for the 'record_date' field.
# The timestamp column is replaced by weekend/Friday flags and by one one-hot
# column per part of the day.
def handle_date(df):
    """Replace 'record_date' with engineered date features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'record_date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'record_date', with 'Is_weekend' and 'Is_friday'
        (0/1) appended, followed by one indicator column per ``daypart``
        label present in the data. If 'Is_weekend' is already a column the
        input is returned untouched.
    """
    if 'Is_weekend' in df:
        return df

    # Work on a copy. Writing the helper columns into the caller's frame would
    # leave it half-processed, and a second call would then hit the guard above
    # and hand that half-processed frame back.
    df = df.copy()

    record_date = pd.to_datetime(df['record_date'])
    day_name = record_date.dt.day_name()

    df['Is_weekend'] = day_name.apply(is_weekend)
    df['Is_friday'] = day_name.apply(is_friday)

    # NOTE: get_dummies only creates columns for the day parts that occur in
    # this frame, so two frames covering different hours get different columns.
    one_hot_dates = pd.get_dummies(record_date.dt.hour.apply(daypart))

    df = pd.concat([df, one_hot_dates], axis=1)
    return df.drop(['record_date'], axis=1)
        

### **Tratamento dos dados**

In [None]:
# Locations of the training and test CSV files, relative to the notebook.
training_file = '../input/trabalho/training_data.csv'
test_file = '../input/trabalho/test_data.csv'

def tratamentoDados(filename):
    """Load a CSV file and turn it into a numeric feature frame.

    Steps: drop 'city_name', 'AVERAGE_PRECIPITATION' and 'AVERAGE_RAIN';
    encode 'LUMINOSITY' and 'AVERAGE_CLOUDINESS' as integers; fill missing
    cloudiness values from the next row, then from the previous one; expand
    'record_date' through ``handle_date``; and, when the column exists,
    encode the 'AVERAGE_SPEED_DIFF' labels as 0-4.

    Parameters
    ----------
    filename : str
        Path of the CSV file (read as ISO-8859-1).

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    df = pd.read_csv(filename,encoding = "ISO-8859-1")
    df = df.drop(['city_name','AVERAGE_PRECIPITATION','AVERAGE_RAIN'],axis=1)

    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[col]: that acts on an intermediate object and is not guaranteed to
    # update df. infer_objects() then gives the column a numeric dtype when
    # every remaining value is numeric.
    df['LUMINOSITY'] = df['LUMINOSITY'].replace({"DARK": 0, "LOW_LIGHT": 1, "LIGHT": 2}).infer_objects()

    # NOTE(review): "algumas nuvens" is coded above "nuvens quebradas" —
    # confirm this is the intended ordinal order.
    df['AVERAGE_CLOUDINESS'] = df['AVERAGE_CLOUDINESS'].replace({"céu limpo": 0, "céu claro": 1, "nuvens dispersas": 2,"nuvens quebrados": 3,"nuvens quebradas": 3,"algumas nuvens": 4,"céu pouco nublado":5,'tempo nublado': 6,'nublado': 6}).infer_objects()
    # Backward fill first, then forward fill for any trailing gaps.
    df['AVERAGE_CLOUDINESS'] = df['AVERAGE_CLOUDINESS'].bfill().ffill()

    df = handle_date(df)

    # The target column is only present in the training file.
    if 'AVERAGE_SPEED_DIFF' in df:
        # NOTE(review): recent pandas versions may parse the literal text
        # "None" as a missing value in read_csv — confirm that the "None"
        # class survives loading.
        df['AVERAGE_SPEED_DIFF'] = df['AVERAGE_SPEED_DIFF'].replace({"None": 0, "Low": 1, "Medium": 2, "High": 3, "Very_High": 4}).infer_objects()
    return df

def reloadDataset(df):
    """Re-read the raw training CSV and return it.

    Parameters
    ----------
    df : object
        Ignored; kept so existing calls keep working. A function cannot
        rebind the caller's variable, so use the return value:
        ``df = reloadDataset(df)``.

    Returns
    -------
    pandas.DataFrame
        The unprocessed contents of ``training_file``.
    """
    return pd.read_csv(training_file,encoding = "ISO-8859-1")

# Support Vector Machine -- Testes Maria

In [None]:
# Run the same preprocessing over the training and the test file.
df_training, df_test = (
    tratamentoDados(csv_path) for csv_path in (training_file, test_file)
)

In [None]:
# Features: every column except the target.
x = df_training.drop(columns=['AVERAGE_SPEED_DIFF'])
# Target kept as a single-column DataFrame.
y = df_training[['AVERAGE_SPEED_DIFF']]

In [None]:
# 10-fold cross-validation of the SVC.
# The scaler is part of the pipeline, so each fold is standardised with the
# statistics of its own training part. This evaluates the same kind of model
# (SVC on standardised features) that is trained further down.
cross_valid_model = make_pipeline(StandardScaler(), SVC(random_state=2021))
scores = cross_val_score(cross_valid_model, x, np.ravel(y), cv=10)
scores

In [None]:
# Average of the per-fold scores.
scores.mean()

In [None]:
# Split for test
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=2000,
)

In [None]:
# Learn the mean and standard deviation from the training split only and
# apply the same transformation to the hold-out split. Standardising the
# hold-out rows with their own statistics would put them on a different
# scale from the one the model is fitted on.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Inspect the scaled training features.
x_train_scaled

In [None]:
# Flatten the single-column target frame into a 1-D array of labels.
y_train_array = y_train.to_numpy().ravel()

In [None]:
# SVC with default hyper-parameters, trained on the scaled training split.
model = SVC(random_state=2021)
model.fit(X=x_train_scaled, y=y_train_array)

In [None]:
# Predict the class of every row in the scaled hold-out split.
predictions = model.predict(x_test_scaled)

In [None]:
# Fraction of hold-out rows whose predicted class matches the true one.
accuracy_score(y_test, predictions)

In [None]:
# Candidate hyper-parameters for the RBF kernel.
C = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
kernel = ['rbf']

param_grid = dict(C=C, gamma=gamma, kernel=kernel)

# Exhaustive search over the grid; refit=True retrains the best combination
# on the whole training split so that `grid` can be used for prediction.
grid = GridSearchCV(
    estimator=SVC(random_state=2021),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
grid.fit(x_train_scaled, y_train_array)

### teste

In [None]:
# Standardise the competition test set with the statistics of the training
# split (the data the model was fitted on), not with the test set's own mean
# and standard deviation.
df_test_scaled = StandardScaler().fit(x_train).transform(df_test)
grid_predictions = grid.predict(df_test_scaled)

In [None]:
# Inspect the predicted classes for the test set.
grid_predictions

In [None]:
# Wrap the predictions in a one-column frame for the submission.
tryMaria = pd.DataFrame({'Speed_Diff': grid_predictions})
tryMaria

In [None]:
# Row ids start at 1; the count comes from the predictions themselves instead
# of a hard-coded 1500.
tryMaria['RowId'] = range(1, len(tryMaria) + 1)
# Map the numeric classes back to their labels. The result is assigned back
# because an in-place replace on a selected column is not guaranteed to
# update the frame.
tryMaria['Speed_Diff'] = tryMaria['Speed_Diff'].replace({0:"None", 1:"Low", 2:"Medium", 3:"High", 4:"Very_High"})
tryMaria = tryMaria[['RowId','Speed_Diff']] # Swap the column order to match the required format
tryMaria

In [None]:
# Attempt number, used only to build the output file name.
try_nr = 1
outputfile = f'maria_try{try_nr}.csv'

# Write the submission without the index, then read it back to check the result.
tryMaria.to_csv(path_or_buf=outputfile, index=False)
submission = pd.read_csv(filepath_or_buffer=outputfile, encoding="ISO-8859-1")
submission.head(n=5)