# Installing dependencies

In [46]:
!pip install pandas scikit-learn numpy xgboost optuna gplearn mlxtend seaborn matplotlib hyperopt



#  Additional data 

I used an additional dataset from this GitHub repository: **https://github.com/zie225/Machine_learning-final_project**

# Importing necessary libraries and data 

In [47]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score


# Read the three input files into DataFrames.
# The additional export is semicolon-separated; the other two files are read
# with the default separator.
train_data1 = pd.read_csv('train.csv')
train_data2 = pd.read_csv('export_alimconfiance.csv', sep=";")
test_data = pd.read_csv('test.csv')


## Create a new column in the additional dataset

In [48]:
# 'Adresse_2_UA' is missing from the additional dataset, so the whole column is set to '_'
# ('_' also represents missing values in the datasets provided by the competition host).
train_data2['Adresse_2_UA'] = '_'

## Merging the datasets

In [49]:
# Stack both training frames row-wise and renumber the rows from 0.
train_data = pd.concat((train_data1, train_data2), axis=0, ignore_index=True)

# (rows, columns) of the merged frame
train_data.shape

(52438, 13)

## Pre-processing

In [50]:

def preprocess_data(input_df):
    """Build the engineered feature frame used by the model.

    The caller's DataFrame is not modified; all work happens on a copy.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing at least the columns ``Date_inspection``,
        ``APP_Libelle_etablissement``, ``Adresse_2_UA``, ``Code_postal``,
        ``geores``, ``filtre``, ``Agrement`` and ``SIRET``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``input_df``, without the
        ``Date_inspection`` and ``geores`` columns, and with the date, text,
        address, coordinate and SIRET-derived feature columns added.

    Notes
    -----
    The TF-IDF vectorizer is fitted inside this function, i.e. once per call.
    NOTE(review): column ``establishment_name_tfidf_i`` therefore does not
    necessarily refer to the same token in two different calls (train vs test).
    """
    tfidf_max_features = 50

    # Never touch the caller's frame: every assignment below would otherwise
    # leak into it until the first rebinding of ``input_df``.
    input_df = input_df.copy()

    # Convert the "Date_inspection" column to timezone-aware (UTC) datetimes
    input_df["Date_inspection"] = pd.to_datetime(input_df["Date_inspection"], utc=True)

    # Calendar components
    input_df["Inspection_Year"] = input_df["Date_inspection"].dt.year
    input_df["Inspection_Month"] = input_df["Date_inspection"].dt.month
    input_df["Inspection_Day"] = input_df["Date_inspection"].dt.day
    input_df["Inspection_DayOfWeek"] = input_df["Date_inspection"].dt.dayofweek
    input_df["Inspection_WeekOfYear"] = input_df["Date_inspection"].dt.isocalendar().week
    input_df["Inspection_Quarter"] = input_df["Date_inspection"].dt.quarter

    # Season index 1..4: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
    input_df["Inspection_Season"] = (input_df["Date_inspection"].dt.month % 12 + 3) // 3

    # Text-based features: TF-IDF of the establishment name
    input_df['APP_Libelle_etablissement'] = input_df['APP_Libelle_etablissement'].fillna("Unknown")
    tfidf = TfidfVectorizer(max_features=tfidf_max_features, stop_words='english')
    establishment_name_tfidf = tfidf.fit_transform(input_df['APP_Libelle_etablissement'].astype(str)).toarray()
    tfidf_columns = [f'establishment_name_tfidf_{i}' for i in range(tfidf_max_features)]
    # The vocabulary may hold fewer than ``tfidf_max_features`` terms, so name
    # only the columns that exist and pad the rest with zeros to keep a fixed
    # schema. Reusing the input index keeps the axis=1 concat row-aligned.
    tfidf_df = pd.DataFrame(
        establishment_name_tfidf,
        columns=tfidf_columns[:establishment_name_tfidf.shape[1]],
        index=input_df.index,
    ).reindex(columns=tfidf_columns, fill_value=0.0)
    input_df = pd.concat([input_df, tfidf_df], axis=1)

    # Length of the address string (computed before any address imputation)
    input_df['address_length'] = input_df['Adresse_2_UA'].astype(str).apply(len)

    # Temporal flags
    input_df['is_weekend'] = (input_df['Inspection_DayOfWeek'] >= 5).astype(int)
    input_df['is_business_hours'] = ((input_df['Date_inspection'].dt.hour >= 9) & (input_df['Date_inspection'].dt.hour <= 17)).astype(int)

    # The raw timestamp is no longer needed
    input_df = input_df.drop(columns="Date_inspection")

    # Fill missing 'Adresse_2_UA' with the most frequent address of the same
    # 'Code_postal'; groups that are entirely missing are left as they are.
    input_df['Adresse_2_UA'] = input_df.groupby('Code_postal')['Adresse_2_UA'].transform(
        lambda x: x.fillna(x.mode().iloc[0]) if not x.isnull().all() else x
    )

    # Split 'geores' ("<first>_<second>") into two coordinate columns.
    # Missing values become "0_0", i.e. both coordinates are 0.0.
    input_df['geores'] = input_df['geores'].fillna("0_0")
    # n=1 guarantees at most two parts, so a stray extra '_' cannot break the
    # two-column assignment (the malformed remainder is coerced to NaN below).
    input_df[['establishment_Longitude', 'establishment_Latitude']] = input_df['geores'].str.split('_', n=1, expand=True)

    # Convert the new columns to numbers; unparsable values become NaN
    input_df[['establishment_Longitude', 'establishment_Latitude']] = input_df[['establishment_Longitude', 'establishment_Latitude']].apply(pd.to_numeric, errors='coerce')

    # Constant placeholders for missing categorical values
    input_df['filtre'] = input_df['filtre'].fillna("Unknown")
    input_df['Agrement'] = input_df['Agrement'].fillna("Unknown")

    # Fill missing values in 'establishment_Latitude' with the column mean
    input_df['establishment_Latitude'] = input_df['establishment_Latitude'].fillna(input_df['establishment_Latitude'].mean())

    # Remaining missing addresses get the overall most frequent address
    # (skipped when the column has no non-null value at all).
    address_mode = input_df['Adresse_2_UA'].mode()
    if not address_mode.empty:
        input_df['Adresse_2_UA'] = input_df['Adresse_2_UA'].fillna(address_mode.iloc[0])

    # Additional features
    input_df['establishment_name_length'] = input_df['APP_Libelle_etablissement'].apply(len)
    input_df['activity_details_length'] = input_df['filtre'].apply(len)
    # Character slices of the SIRET string: positions 0-1 and 2-4
    input_df['industry_code'] = input_df['SIRET'].astype(str).str[:2]
    input_df['region_code'] = input_df['SIRET'].astype(str).str[2:5]
    # Text before the first comma of the address
    input_df['street_name'] = input_df['Adresse_2_UA'].str.split(',').str[0]

    # Convert 'industry_code' and 'region_code' to numeric; bad values become NaN
    input_df['industry_code'] = pd.to_numeric(input_df['industry_code'], errors='coerce')
    input_df['region_code'] = pd.to_numeric(input_df['region_code'], errors='coerce')

    # Interaction feature, built before the NaN codes are imputed below
    input_df['industry_region_interaction'] = input_df['industry_code'].astype(str) + '_' + input_df['region_code'].astype(str)

    # Drop the original 'geores' column
    input_df = input_df.drop('geores', axis=1)

    # Impute the numeric codes with their median and the longitude with its mean
    input_df['industry_code'] = input_df['industry_code'].fillna(input_df['industry_code'].median())
    input_df['establishment_Longitude'] = input_df['establishment_Longitude'].fillna(input_df['establishment_Longitude'].mean())
    input_df['region_code'] = input_df['region_code'].fillna(input_df['region_code'].median())

    return input_df

# Run the same feature-engineering step on the training frame, then on the test frame
train_data = preprocess_data(input_df=train_data)
test_data = preprocess_data(input_df=test_data)


In [51]:
# Columns that must be stored as floats in both frames
numerical_features = [
    'Inspection_Year',
    'Inspection_Month',
    'Inspection_Day',
    'Inspection_DayOfWeek',
    'Inspection_WeekOfYear',
    'Inspection_Quarter',
    'Inspection_Season',
    'establishment_Longitude',
    'establishment_Latitude',
    'establishment_name_length',
    'activity_details_length',
    'industry_code',
    'region_code',
]

# Apply the identical cast to the training frame first, then the test frame
for frame in (train_data, test_data):
    frame[numerical_features] = frame[numerical_features].astype(float)


## Checking missing values

In [52]:
# Number of missing entries per column of the preprocessed training frame
train_data.isnull().sum()

APP_Libelle_etablissement      0
SIRET                          0
Adresse_2_UA                   0
Code_postal                    0
Libelle_commune                0
                              ..
activity_details_length        0
industry_code                  0
region_code                    0
street_name                    0
industry_region_interaction    0
Length: 79, dtype: int64

## Modeling

In [53]:
# Define the categorical and numerical features that will be used
categorical_features = ['APP_Libelle_etablissement', 'SIRET', 'Code_postal',
                         'Libelle_commune', 'Numero_inspection', 'APP_Libelle_activite_etablissement',
                         'filtre', 'ods_type_activite', 'street_name',
                         'industry_region_interaction', 'Agrement']
numerical_features = ['Inspection_Year', 'Inspection_Month', 'Inspection_Day',
                       'Inspection_DayOfWeek', 'Inspection_WeekOfYear', 'Inspection_Quarter',
                       'Inspection_Season', 'establishment_Longitude',
                       'establishment_Latitude', 'establishment_name_length', 'activity_details_length',
                       'industry_code', 'region_code']

# Combine training and test data so the ordinal encoder sees every category
# that occurs in either frame
combined_data = pd.concat([train_data, test_data], ignore_index=True)

# Create an instance of OrdinalEncoder
encoder = OrdinalEncoder()

# Fit and transform the categorical features on the combined data
encoded_categorical_features_combined = encoder.fit_transform(combined_data[categorical_features])

# The first len(train_data) rows of the combined array belong to the training
# frame. Reuse the training index so the axis=1 concat below is row-aligned
# even when train_data does not carry a default 0..n-1 index.
train_data_encoded = pd.DataFrame(
    encoded_categorical_features_combined[:len(train_data)],
    columns=categorical_features,
    index=train_data.index,
)

# Combine encoded categorical and numerical features for the training data
features_train = pd.concat([train_data_encoded, train_data[numerical_features]], axis=1)

# Encode the target variable for the training data
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(train_data['Synthese_eval_sanit'])

# Hold out 30% of the rows for evaluation. `stratify` keeps the class
# proportions of the target identical in both parts (it was announced in the
# original comment but never passed).
X_train, X_test, y_train, y_test = train_test_split(
    features_train,
    target_encoded,
    test_size=0.3,
    random_state=42,
    stratify=target_encoded,
)

# Use the best hyperparameters obtained from Optuna
best_hyperparameters = {'subsample': 0.8807451487381295, 'n_estimators': 881, 'min_child_weight': 2,
                        'max_depth': 14, 'learning_rate': 0.023535353535353534, 'colsample_bytree': 0.21950136279318744}

# Initialize the XGBoost classifier with the best hyperparameters
best_xgb_classifier = XGBClassifier(
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    **best_hyperparameters
)

# Fit the model on the training part of the split
best_xgb_classifier.fit(X_train, y_train)

# Predict on the held-out part
y_pred = best_xgb_classifier.predict(X_test)

# Convert the predicted labels back to the original class labels
y_pred_original = label_encoder.inverse_transform(y_pred)

# Calculate and print accuracy on the held-out part (encoded labels on both sides)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Refit the same classifier on the entire training set; this is the model
# used for the final predictions
best_xgb_classifier.fit(features_train, target_encoded)


KeyboardInterrupt: 

## Preparing the submission file

In [None]:
# Encode the categorical features of the preprocessed test data with the
# already-fitted encoder. The test index is reused so the axis=1 concat below
# is row-aligned.
encoded_categorical_features_test = encoder.transform(test_data[categorical_features])
test_data_encoded = pd.DataFrame(
    encoded_categorical_features_test,
    columns=categorical_features,
    index=test_data.index,
)

# Combine encoded categorical and numerical features for the test data
# (same column order as the training features)
test_features = pd.concat([test_data_encoded, test_data[numerical_features]], axis=1)

# Predict with the fitted XGBoost classifier and map the encoded classes back
# to the original labels
predictions = best_xgb_classifier.predict(test_features)
predictions = label_encoder.inverse_transform(predictions)

# Re-read the raw test file into its own variable. Overwriting `test_data`
# here would discard the preprocessed frame and make this cell fail when it
# is run a second time.
submission = pd.read_csv('test.csv')
assert len(submission) == len(predictions), "one prediction is expected per row of test.csv"

# Store the predictions in the 'Synthese_eval_sanit' column
submission['Synthese_eval_sanit'] = predictions

# Save the submission to a CSV file
submission.to_csv('result.csv', index=False, encoding='UTF-8')
