In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while a download is streamed to the temp file.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'crime-cast-forecasting-crime-categories:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F77420%2F8446444%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240802%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240802T061032Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D94c7196da7ada28203dad690006d34295698d2ba2f9f35758f6025a0bb37c2d73e1219641c780d04ab45348c924caa44aa729788f32a4a56b89d556f4fa8b2f64e26062c5978c711907bde966c3430a253752526dc3b5126f0b32ab9290716bb53979c272dea71c0d62e22317a05c937fad1994403935303b51bfa9a5037e45bce4758df1a62160388d84179791a1ebd95d531c708e34dbced0428722a3fdd63023745d66cca404e0c34828239ea49b2b0e5b491c794539e267245ec5f2ab855e5a8268b84ce8a3d518e362527a44bf4af9ad0cf196c1bde74bd72e1c092f85465de445d36161250532b933e5121f1aa76d5d4f5abe2e8a0f70e41eb45bfa5d2'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:

import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Walk the input tree and print the full path of every file found in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

In [None]:
train= pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/train.csv")
test=pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/test.csv")
sample=pd.read_csv('/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv')

In [None]:
print(f"Shape of test dataset: {test.shape}")
print(f"Shape of sample_submission: {sample.shape}")

In [None]:
X=train.drop('Crime_Category',axis='columns')
y=train['Crime_Category']

In [None]:
missing_cols = set(X.columns) - set(test.columns)
for col in missing_cols:
    test[col] = np.nan


In [None]:
test = test[X.columns]

In [None]:
X_train, X_val, y_train, y_val =train_test_split(X,y,test_size=0.33, random_state=42)

In [None]:
X_train.info()

In [None]:
train.isna().sum()

In [None]:
# Bar chart with the number of rows per Crime_Category value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=y, ax=ax)
ax.set_title('Distribution of Crime_Category in the Training Set')
ax.set_xlabel('Crime_Category')
ax.set_ylabel('Count')
plt.show()

In [None]:
nan_cols=["Weapon_Used_Code"]
nan_cols

In [None]:
mean_imputer=SimpleImputer(strategy="mean")

In [None]:
X_train[nan_cols]=mean_imputer.fit_transform(X_train[nan_cols])
X_val[nan_cols]=mean_imputer.transform(X_val[nan_cols])
test[nan_cols]=mean_imputer.transform(test[nan_cols])

In [None]:
X_train[nan_cols]

In [None]:
Categorical_cols=X_train.select_dtypes(include=['object']).columns
Categorical_cols

In [None]:
threshold=5

In [None]:
OneHotEncoder_cols=Categorical_cols[X_train[Categorical_cols].nunique()>threshold]
OneHotEncoder_cols

In [None]:
OrdinalEncoder_cols=Categorical_cols[X_train[Categorical_cols].nunique()<=threshold]
OrdinalEncoder_cols

In [None]:
Ord_encoder=OrdinalEncoder()
Ord_encoder.fit(X_train[OrdinalEncoder_cols])

In [None]:
Oe=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)

In [None]:
X_train[OrdinalEncoder_cols]=Oe.fit_transform(X_train[OrdinalEncoder_cols])
X_val[OrdinalEncoder_cols]=Oe.transform(X_val[OrdinalEncoder_cols])

In [None]:
test[OrdinalEncoder_cols]=Oe.transform(test[OrdinalEncoder_cols])

In [None]:
X_train.head()

In [None]:
from operator import index
Oh=OneHotEncoder(sparse_output=False ,handle_unknown='ignore')
X_train_encoded=pd.DataFrame(Oh.fit_transform(X_train[OneHotEncoder_cols]),index=X_train.index,columns=Oh.get_feature_names_out())
X_val_encoded=pd.DataFrame(Oh.transform(X_val[OneHotEncoder_cols]),index=X_val.index,columns=Oh.get_feature_names_out())
test_encoded=pd.DataFrame(Oh.transform(test[OneHotEncoder_cols]),index=test.index,columns=Oh.get_feature_names_out())
X_train_encoded.head()

In [None]:
X_train.join(X_train_encoded).drop(columns=OneHotEncoder_cols)

In [None]:
X_train=X_train.join(X_train_encoded).drop(columns=OneHotEncoder_cols)
X_val=X_val.join(X_val_encoded).drop(columns=OneHotEncoder_cols)
test=test.join(test_encoded).drop(columns=OneHotEncoder_cols)

In [None]:
X_train.info()

In [None]:
# -1 is the sentinel the ordinal encoder emits for categories it did not see during fit.
# Turn it into NaN only in the ordinal-encoded columns: a whole-frame replace would also
# wipe out genuine -1 values in every other column.
X_train[OrdinalEncoder_cols] = X_train[OrdinalEncoder_cols].replace(-1, np.nan)
X_val[OrdinalEncoder_cols] = X_val[OrdinalEncoder_cols].replace(-1, np.nan)
test[OrdinalEncoder_cols] = test[OrdinalEncoder_cols].replace(-1, np.nan)

In [None]:
nan_count=X_train.isnull().sum()
nan_count

In [None]:
nan_cols=nan_count[nan_count > 0].index
nan_cols

In [None]:
Mode_imputer=SimpleImputer(strategy="most_frequent")

In [None]:
X_train.isnull().sum().sum()

In [None]:
num_cols = X.select_dtypes(include=['number']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
cat_cols

In [None]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [None]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

In [None]:
knn_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
knn_model.fit(X_train, y_train)

In [None]:
y_knn_pred = knn_model.predict(X_val)

In [None]:
knn_accuracy = accuracy_score(y_val, y_knn_pred)
knn_accuracy

In [None]:
print(f'Validation Accuracy of KNN : {knn_accuracy}')

In [None]:
Dc_model=DummyClassifier(strategy='most_frequent')
Dc_model.fit(X_train,y_train)

In [None]:
y_Dc_pred=Dc_model.predict(test)

In [None]:
y_Dc_pred[ :20]

In [None]:
svm_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear', C=1.0))
])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
svm_model.fit(X_train, y_train)

In [None]:
 y_svm_pred = svm_model.predict(X_val)

In [None]:
svm_accuracy = accuracy_score(y_val, y_svm_pred)
print(f'Validation Accuracy: {svm_accuracy}')

In [None]:
best_model = knn_model if knn_accuracy > svm_accuracy else svm_model

In [None]:
best_model

In [None]:
X_test=pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/test.csv")

In [None]:
test_predictions = best_model.predict(X_test)

In [None]:
test_predictions

In [None]:
test_predictions.shape

In [None]:
# Build the submission table: IDs run 1..N where N is the number of predictions, so
# the frame is valid for any test-set size instead of exactly 5000 rows.
submission = pd.DataFrame(
    {
        "ID": np.arange(1, len(test_predictions) + 1),
        "Crime_Category": test_predictions,
    }
)

# Written to the current working directory without the index column.
submission.to_csv('submission.csv', index=False)