In [11]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Read the raw data. The test file's first column is used as the row index
# (it is later written out as the submission "Id").
data_train = pd.read_csv('train2.csv')
data_test = pd.read_csv('test2.csv', index_col=0)

# Report how many training rows are exact duplicates, then keep only the
# first occurrence of each. The drop is done in place on data_train.
duplicate_count = data_train.duplicated().sum()
print(duplicate_count)
data_train.drop_duplicates(keep='first', inplace=True)

# Print the per-column count of missing values, training frame first.
for frame in (data_train, data_test):
    print(frame.isnull().sum())

# Replace the raw 'Dates' column of both frames with four numeric calendar
# features. Each frame is modified in place; the new columns are appended in
# the order Year, Month, Day, Hour and 'Dates' itself is then removed.
DATE_PARTS = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Hour', 'hour'),
)
for frame in (data_train, data_test):
    timestamps = pd.to_datetime(frame['Dates'])
    for column_name, dt_attribute in DATE_PARTS:
        frame[column_name] = getattr(timestamps.dt, dt_attribute)
    frame.drop(columns='Dates', inplace=True)

# Data preprocessing
# Split the training frame into features and target. 'Resolution' and
# 'Descript' are removed together with the target so that the training
# features are the same columns the test frame provides.
X_train = data_train.drop(['Category', 'Resolution', 'Descript'], axis=1)
y_train = data_train['Category']

# Build the test feature frame as a copy (so the raw string columns of
# data_test are not overwritten) with its columns in exactly the training
# order: the scaler and the model below are positional, so a different column
# order would silently feed the wrong feature into each slot. A missing
# column raises KeyError here instead of failing later in a less obvious way.
X_test = data_test[X_train.columns].copy()

# Encode the target with an encoder that is used for nothing else, so the
# integer class codes produced by the model can be mapped back to category
# names with le.inverse_transform(...).
le = LabelEncoder()
y_train = le.fit_transform(y_train)

# Encode categorical features.
# Each column gets its own encoder, fitted once on the union of the training
# and test values and then applied to both frames. Re-fitting an encoder on
# the test data alone would assign different integers to the same string
# (e.g. the same address) in train and test, which makes the model's learned
# splits meaningless at prediction time. Fitting on the union also avoids the
# ValueError that LabelEncoder.transform raises for values unseen during fit.
feature_encoders = {}
for column in ('PdDistrict', 'DayOfWeek', 'Address'):
    encoder = LabelEncoder()
    encoder.fit(pd.concat([X_train[column], X_test[column]], ignore_index=True))
    X_train[column] = encoder.transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])
    feature_encoders[column] = encoder

# Feature scaling: statistics are learned from the training data only and
# then re-used unchanged for the test data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions (integer class codes in the encoding learned by `le`)
preds = model.predict(X_test)

# Create and save output DataFrame. "Predictions" holds the integer class
# codes; use le.inverse_transform(preds) to obtain the category names.
output = pd.DataFrame({"Id": data_test.index, "Predictions": preds})
output.to_csv('submission.csv', index=False)

2323
Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64
Dates         0
DayOfWeek     0
PdDistrict    0
Address       0
X             0
Y             0
dtype: int64
