In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop in this cell splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL embeds X-Goog-Date / X-Goog-Expires / X-Goog-Signature
# query parameters, so it is presumably a time-limited signed link -- confirm it
# is still valid (and acceptable to share) before relying on it.
DATA_SOURCE_MAPPING = 'sml-project:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F50088%2F5285395%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240707%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240707T081636Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D51649cc99fd3206fbe5c1ae9da2377b6f8cb7fba81ddbed4039b58c1d968f1691aa8b60cbfbf1d907b2e70656eb8e7a4a3bf4f85a1c403f947254a88814720ce3daad40b7223354b6b972dc756990f9222d81f0319a151159d42694d8bd50508233bc54e62026dd705a395b8053276f06f5d084d40305dba76fe60dd31508f2e6ca121416b013c03687b184b9c8236962ce2f6a0f140180406c7db1dbc40412bc8d448427f07a4556147be59209069711b86aa63791abd7637a1e6d36c30f860a4a2aeacb5881131636d7e52f01f0a819d419dfe6ac93a9288560db591591e86b435275c540cd38ad0fcc4bdf15a76058cbcd8e13626577d3b0d0a6fd3c88531'

# Local directories this cell creates to hold the downloaded input data and
# the notebook's working files.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining sources still load.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray colon in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the size is unknown and
            # the progress bar is replaced by a plain byte counter.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # Clamp so a server that sends more than it announced
                    # cannot produce a negative padding width.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Make sure every buffered byte is written and rewind before the
            # archive readers consume the file.
            tfile.flush()
            tfile.seek(0)
            # NOTE(review): extractall() trusts the member paths inside the
            # archive; only use this with sources you trust.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle and bind the archive to its own
                # name so the `tarfile` module is not shadowed.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path} ({e})')
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path} ({e})')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set and build the feature matrix used for model fitting.
train_raw = pd.read_csv("/kaggle/input/sml-project/train.csv")

# dropna()/drop_duplicates() return new frames and must be *called*; the bare
# attribute references used before did nothing. `new_x` keeps the 'ID' and
# 'category' columns of the cleaned rows so that labels read from it later stay
# row-aligned with `training_data`.
new_x = train_raw.dropna().drop_duplicates()
training_data = new_x.drop(columns=['category', 'ID'])

# Drop columns whose value is identical in every row (compared against the
# first row); they carry no information. The mask is reused on the test set.
same_values_mask = (training_data == training_data.iloc[0]).all(axis=0)
training_data = training_data.loc[:, ~same_values_mask]

# Despite its name, `nonzero_mask` is True for columns with FEWER than 25
# non-zero entries, i.e. the near-empty columns that get removed. It is indexed
# by the columns remaining after the previous filter and is also reused on the
# test set.
nonzero_mask = (training_data != 0).sum(axis=0) < 25
training_data = training_data.loc[:, ~nonzero_mask]
training_data.head()

In [None]:
# Load the test set. `new_y` keeps the untouched frame (including 'ID') because
# the submission cell needs the IDs in their original order.
new_y = pd.read_csv("/kaggle/input/sml-project/test.csv")

# No rows are dropped here: the previous bare `dropna` / `drop_duplicates`
# references were no-ops, and actually removing test rows would leave the
# predictions out of step with `new_y.ID` in the submission.
testing_data = new_y.drop(columns='ID')

# Apply exactly the column filters learned on the training set, in the same
# order (the second mask is indexed by the columns left after the first).
testing_data = testing_data.loc[:, ~same_values_mask]
testing_data = testing_data.loc[:, ~nonzero_mask]

testing_data.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import RFE
from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings('ignore')

# Bind the names the modelling cells operate on: the filtered training and test
# feature frames, and the 'category' column of `new_x` as the target.
X_modelfitting, X_testing = training_data, testing_data
Y_output = new_x["category"]

In [None]:
# Remove training rows that IsolationForest labels as outliers (-1).
# The forest is randomised, so a fixed seed is required for the same rows to be
# removed on every Restart & Run All.
# NOTE: this cell overwrites its own inputs; re-running it without re-running
# the cells above filters the already-filtered data again.
assert len(X_modelfitting) == len(Y_output), "features and labels are out of step"

isolationtree = IsolationForest(random_state=42)
outliers = isolationtree.fit_predict(X_modelfitting)

mask = outliers != -1  # True for rows to keep
number_outliers = int((~mask).sum())
print("Number of outliers:", number_outliers)

X_modelfitting = X_modelfitting[mask]
Y_output = Y_output[mask]

In [None]:
# First PCA stage: project onto 1000 principal components. The projection is
# fitted on the training data only and then applied unchanged to the test data.
# random_state pins the result when scikit-learn selects a randomised SVD solver.
# NOTE: this cell overwrites its inputs, so it must run exactly once per
# Restart & Run All.
pca_to_best_components = PCA(n_components=1000, random_state=42)
X_modelfitting = pca_to_best_components.fit_transform(X_modelfitting)
X_testing = pca_to_best_components.transform(X_testing)

In [None]:
# Second PCA stage: reduce the output of the previous PCA cell to 400
# components, again fitting on the training data only.
# random_state pins the result when scikit-learn selects a randomised SVD solver.
# NOTE(review): PCA applied to already-PCA-projected data should select much the
# same subspace as a single PCA(n_components=400) on the original features --
# confirm whether the two-stage reduction is intentional.
pca_to_best_components = PCA(n_components=400, random_state=42)
X_modelfitting = pca_to_best_components.fit_transform(X_modelfitting)
X_testing = pca_to_best_components.transform(X_testing)

In [None]:
# Recursive feature elimination: starting from all columns, drop one feature per
# round (step=1), ranked by a logistic-regression fit, until 379 remain.
estimator_using_rfe = LogisticRegression(max_iter=10000)
selector_using_rfe = RFE(
    estimator_using_rfe,
    n_features_to_select=379,
    step=1,
)
selector_using_rfe.fit(X_modelfitting, Y_output)

# Boolean column mask of the retained features, applied to both matrices.
kept_columns = selector_using_rfe.support_
X_modelfitting, X_testing = X_modelfitting[:, kept_columns], X_testing[:, kept_columns]

In [None]:
# Build two extra features from a KMeans clustering of the selected features:
# the cluster id of each sample, and the share of TRAINING samples that fall in
# that cluster. The model is fitted on the training data only.
#
# Changes from the earlier version: the single-iteration `for i in range(1)`
# loop and the duplicated DataFrame construction are gone, KMeans is seeded so
# cluster ids are stable between runs, and prediction uses the same array type
# the model was fitted on.
CLUSTER_COL = 'cluster0'
RATIO_COL = 'cluster_ratio0'

cluster_model = KMeans(n_clusters=4, random_state=42)
cluster_assignments_train = cluster_model.fit_predict(X_modelfitting)
cluster_assignments_test = cluster_model.predict(X_testing)

# Fraction of training samples per cluster id (index: cluster id).
cluster_counts = pd.Series(cluster_assignments_train).value_counts(normalize=True)

X_modelfitting_df = pd.DataFrame(data=X_modelfitting)
X_modelfitting_df[CLUSTER_COL] = cluster_assignments_train
X_modelfitting_df[RATIO_COL] = X_modelfitting_df[CLUSTER_COL].map(cluster_counts)

# Test samples get the ratio observed on the training set for their cluster.
X_testing_df = pd.DataFrame(data=X_testing)
X_testing_df[CLUSTER_COL] = cluster_assignments_test
X_testing_df[RATIO_COL] = X_testing_df[CLUSTER_COL].map(cluster_counts)

# Keep only the two derived columns; they are joined to the LDA output later.
X_modelfitting_cluster = X_modelfitting_df[[CLUSTER_COL, RATIO_COL]].to_numpy()
X_testing_cluster = X_testing_df[[CLUSTER_COL, RATIO_COL]].to_numpy()

In [None]:
# Supervised projection onto 19 discriminant axes, fitted on the training
# features and labels and then applied to both matrices.
# NOTE(review): LDA allows at most n_classes - 1 components, so 19 presumably
# means the target has at least 20 classes -- confirm against the data.
linear_analysis = LinearDiscriminantAnalysis(n_components=19)
linear_analysis.fit(X_modelfitting, Y_output)
X_train_lda = linear_analysis.transform(X_modelfitting)
X_test_lda = linear_analysis.transform(X_testing)

In [None]:
# Final training matrix: the two cluster features followed by the LDA components.
X_train_final = np.hstack([X_modelfitting_cluster, X_train_lda])

# Final test matrix, built with the same column layout.
X_test_final = np.hstack([X_testing_cluster, X_test_lda])

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

# Base learner: L2-regularised logistic regression (C=0.02 is strong
# regularisation; max_iter is raised so the solver has room to converge).
logistic = LogisticRegression(max_iter=10000, C=0.02)

# Bagging ensemble of 300 logistic-regression models, each trained on a
# bootstrap sample. The base model is passed positionally because the keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn, and
# the first positional slot works under either name. random_state makes the
# bootstrap samples, and therefore the predictions, reproducible.
bagging = BaggingClassifier(logistic, n_estimators=300, random_state=42)

# Train the bagging classifier on the training data
bagging.fit(X_train_final, Y_output)

# Make predictions on the test data using the bagging classifier
Y_predicted = bagging.predict(X_test_final)

print(Y_predicted)

In [None]:
# Pair each test ID with its predicted category and write the result to
# submission.csv without the DataFrame index.
submission_columns = {
    'ID': new_y['ID'],
    'category': Y_predicted,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)

In [None]:
from IPython.display import FileLink

# The cell's last expression is a FileLink, so the notebook displays a link to
# the submission file.
FileLink('submission.csv')