# Encode Smote-n 
This notebook encodes the "smote-encoded.csv" dataset. Importantly, when we implemented CatBoost, we didn't do any encoding since it was not required. Thus this notebook encodes the existing "smote-encoded.csv" with no other transformation.


In [7]:
import torch
from torch import nn
import numpy as np
import scipy
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import sklearn 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler

In [205]:
# Load the preprocessed dataset; the first CSV column is used as the index.
df = pd.read_csv(
    '../data/atlassian-allianz-final-submission-data/encoded_smoten_preprocessed_allianz_atlassian.csv',
    index_col=0,
)

# Drop day_of_week_bind, day_of_week_incident and insured_zip before encoding.
df = df.drop(columns=['day_of_week_bind', 'day_of_week_incident', 'insured_zip'])

# Encoding type of every column, looked up per column by Encode.main():
#   'q' = quantitative, 'o' = ordinal, 'n' = nominal.
# Every value is written out explicitly, so no post-processing of the dict
# is needed and it never holds placeholder values.
df__data_type_dict = {
    "months_as_customer": 'q',
    "age": 'o',
    "policy_deductable": 'o',
    "policy_annual_premium": 'q',
    "umbrella_limit": 'o',
    "capital-gains": 'q',
    "incident_severity": 'o',
    "incident_hour_of_the_day": 'q',
    "number_of_vehicles_involved": 'o',
    "bodily_injuries": 'n',
    "witnesses": 'n',
    "total_claim_amount": 'q',
    "injury_claim": 'q',
    "property_claim": 'q',
    "vehicle_claim": 'q',
    "auto_year": 'q',
    "fraud_reported": 'n',
    "frequency": 'o',
    "year_bind": 'q',
    "month_bind": 'q',
    "date_bind": 'q',
    "is_weekend_bind": 'q',
    "policy_state_IL": 'n',
    "policy_state_IN": 'n',
    "policy_state_OH": 'n',
    "policy_csl_100/300": 'n',
    "policy_csl_250/500": 'n',
    "policy_csl_500/1000": 'n',
    "umbrella_limit_indicator": 'o',
    "frequency_zip": 'q',
    "capital-gain_indicator": 'n',
    "year_incident": 'q',
    "month_incident": 'q',
    "date_incident": 'q',
    # NOTE: day_of_week_incident is dropped from df right after loading, so
    # this entry is not looked up for the frame encoded in this notebook.
    "day_of_week_incident": 'q',
    "is_weekend_incident": 'n',
    "incident_Multi-vehicle Collision": 'n',
    "incident_Parked Car": 'n',
    "incident_Single Vehicle Collision": 'n',
    "incident_Vehicle Theft": 'n',
    "collision_?": 'n',
    "collision_Front Collision": 'n',
    "collision_Rear Collision": 'n',
    "collision_Side Collision": 'n',
    "auth_Ambulance": 'n',
    "auth_Fire": 'n',
    "auth_Other": 'n',
    "auth_Police": 'n',
    "auth_Unknown": 'n',
    "incident_state_NC": 'n',
    "incident_state_NY": 'n',
    "incident_state_OH": 'n',
    "incident_state_PA": 'n',
    "incident_state_SC": 'n',
    "incident_state_VA": 'n',
    "incident_state_WV": 'n',
    # From here on every column is nominal.
    "incident_city_Arlington": 'n',
    "incident_city_Columbus": 'n',
    "incident_city_Hillsdale": 'n',
    "incident_city_Northbend": 'n',
    "incident_city_Northbrook": 'n',
    "incident_city_Riverwood": 'n',
    "incident_city_Springfield": 'n',
    "property_damge_?": 'n',
    "property_damge_NO": 'n',
    "property_damge_YES": 'n',
    "police_?": 'n',
    "police_NO": 'n',
    "police_YES": 'n',
    "total_claim_amount_indicator": 'n',
    "injury_claim_amount_indicator": 'n',
    "property_claim_amount_indicator": 'n',
    "vehicle_claim_amount_indicator": 'n',
    "make_Accura": 'n',
    "make_Audi": 'n',
    "make_BMW": 'n',
    "make_Chevrolet": 'n',
    "make_Dodge": 'n',
    "make_Ford": 'n',
    "make_Honda": 'n',
    "make_Jeep": 'n',
    "make_Mercedes": 'n',
    "make_Nissan": 'n',
    "make_Saab": 'n',
    "make_Suburu": 'n',
    "make_Toyota": 'n',
    "make_Volkswagen": 'n',
    "auto_model_3 Series": 'n',
    "auto_model_92x": 'n',
    "auto_model_93": 'n',
    "auto_model_95": 'n',
    "auto_model_A3": 'n',
    "auto_model_A5": 'n',
    "auto_model_Accord": 'n',
    "auto_model_C300": 'n',
    "auto_model_CRV": 'n',
    "auto_model_Camry": 'n',
    "auto_model_Civic": 'n',
    "auto_model_Corolla": 'n',
    "auto_model_E400": 'n',
    "auto_model_Escape": 'n',
    "auto_model_F150": 'n',
    "auto_model_Forrestor": 'n',
    "auto_model_Fusion": 'n',
    "auto_model_Grand Cherokee": 'n',
    "auto_model_Highlander": 'n',
    "auto_model_Impreza": 'n',
    "auto_model_Jetta": 'n',
    "auto_model_Legacy": 'n',
    "auto_model_M5": 'n',
    "auto_model_MDX": 'n',
    "auto_model_ML350": 'n',
    "auto_model_Malibu": 'n',
    "auto_model_Maxima": 'n',
    "auto_model_Neon": 'n',
    "auto_model_Passat": 'n',
    "auto_model_Pathfinder": 'n',
    "auto_model_RAM": 'n',
    "auto_model_RSX": 'n',
    "auto_model_Silverado": 'n',
    "auto_model_TL": 'n',
    "auto_model_Tahoe": 'n',
    "auto_model_Ultima": 'n',
    "auto_model_Wrangler": 'n',
    "auto_model_X5": 'n',
    "auto_model_X6": 'n',
}



In [207]:
# Column-wise encoder driven by df__data_type_dict
class Encode:
    """Encode the columns of a DataFrame in place, one column at a time.

    The encoder applied to each column is selected by its type code in
    ``df__data_type_dict``:

    * ``"o"`` - ordinal      -> ``MinMaxScaler``
    * ``"n"`` - nominal      -> ``OneHotEncoder`` (first category dropped)
    * ``"q"`` - quantitative -> ``MinMaxScaler``

    The frame handed to the constructor is modified directly; ``self.df``
    refers to that same object.
    """

    def __init__(self, df: pd.DataFrame, df__data_type_dict: dict):
        self.df = df
        self.df__data_type_dict = df__data_type_dict

    def main(self) -> None:
        """Encode every column of ``self.df`` according to its type code.

        A column whose code is not ``'q'``, ``'o'`` or ``'n'`` is reported
        and left untouched. A ``ValueError`` raised while encoding a column
        is reported together with its message and the loop continues with
        the next column.

        Raises:
            KeyError: if a column has no entry in ``df__data_type_dict``.
        """
        # Iterate over a snapshot of the labels: encode_nominal may replace
        # one column by several, which changes self.df.columns.
        for col in list(self.df.columns):
            dtype = self.df__data_type_dict[col]
            try:
                if dtype in ('q', 'o'):
                    self.encode_quant_ordinal(col)
                elif dtype == 'n':
                    self.encode_nominal(col)
                else:
                    print(f"Something wrong col: {col} and dtype {dtype}")
            except ValueError as err:
                # Best effort: report the cause and keep going.
                print(f"Error Occured in {col}: {err}")

    def encode_quant_ordinal(self, col: str) -> None:
        """Overwrite ``col`` with the output of a freshly fitted MinMaxScaler."""
        minmax_scaler = MinMaxScaler()
        scaled = minmax_scaler.fit_transform(self.df[[col]])
        # fit_transform returns an (n_rows, 1) array; flatten it for assignment.
        self.df[col] = scaled.ravel()

    def encode_nominal(self, col: str) -> None:
        """One-hot encode ``col`` with the first category dropped.

        * Two categories: the single remaining indicator overwrites ``col``
          under its original name.
        * More than two categories: ``col`` is replaced, at the same
          position, by one indicator column per remaining category, named
          by ``OneHotEncoder.get_feature_names_out``.

        Raises:
            ValueError: if ``col`` has a single category (no indicator is
                left after dropping the first), or if a generated indicator
                name already exists in the frame. ``self.df`` is not
                modified in either case.
        """
        onehotencoder = OneHotEncoder(sparse_output=False, drop='first')
        encoded = onehotencoder.fit_transform(self.df[[col]])
        n_indicators = encoded.shape[1]

        if n_indicators == 0:
            raise ValueError(
                f"column {col!r} has a single category; nothing left to encode"
            )

        if n_indicators == 1:
            self.df[col] = encoded[:, 0]
            return

        # Several indicators cannot be stored under one label, so expand the
        # column into one new column per remaining category.
        names = [str(name) for name in onehotencoder.get_feature_names_out([col])]
        clashes = [name for name in names if name in self.df.columns]
        if clashes:
            raise ValueError(f"indicator column(s) already exist: {clashes}")

        position = self.df.columns.get_loc(col)
        self.df.drop(columns=[col], inplace=True)
        for offset, name in enumerate(names):
            self.df.insert(position + offset, name, encoded[:, offset])
        


In [233]:
# Encode df column by column using the type codes in df__data_type_dict.
encode = Encode(df, df__data_type_dict)
encode.main()

# The encoder works on the frame it holds; keep a handle to the result.
df_encoded = encode.df


In [239]:
from pathlib import Path

# Output location for the encoded dataset.
directory_path = Path('../data/01_encoded_no_transformations/')
file_with_path = directory_path / '01_encoded_no_transformations.csv'

# Create the output folder (and any missing parents) if it is not there yet.
directory_path.mkdir(parents=True, exist_ok=True)

# Write the encoded frame without its index.
df_encoded.to_csv(file_with_path, index=False)