In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so edits to imported .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2


# Import Libraries

In [2]:
import os

import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score


# Read Data

In [3]:
# Directory (relative to the notebook) that holds the competition CSV files.
path = "../data/"

# Read the train and test splits in one pass; the generator yields the frames
# in the same order as the file names, so unpacking keeps them matched.
train_data, test_data = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Report (rows, columns) for each split, then preview the first training rows.
print(f"Number of rows and columns in the train data set: {train_data.shape}")
print(f"Number of rows and columns in the test data set: {test_data.shape}")
train_data.head()


Number of rows and columns in the train data set: (8050, 4)
Number of rows and columns in the test data set: (2684, 3)


Unnamed: 0,movie_name,target,movie_description,id
0,Furies,0,Three furious vigilantes unite to take down a ...,133529636342330622371894152500993949030
1,RRR,0,The story of freedom fighters Komaram Bheem an...,133529660110779376651195430564179049830
2,John Wick,0,Legendary assassin John Wick (Keanu Reeves) re...,133529680710101630359923204885606137190
3,John Wick: Chapter 3 -- Parabellum,0,After gunning down a member of the High Table ...,133529687048354631501070212369122164070
4,Top Gun: Maverick,0,After more than thirty years of service as one...,133529699724860633783364227336154217830


In [4]:
# Per-class summary (count / unique / top / freq) of the remaining columns;
# the `count` column doubles as a quick look at class balance.
train_data.groupby(by="target").describe()


Unnamed: 0_level_0,movie_name,movie_name,movie_name,movie_name,movie_description,movie_description,movie_description,movie_description,id,id,id,id
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
0,920,908,The Mummy,2,920,920,Three furious vigilantes unite to take down a ...,1,920,920,133529636342330622371894152500993949030,1
1,1096,1075,The Three Musketeers,4,1096,1096,An interactive comedy adventure about two indo...,1,1096,1096,133537091712423214646061704986720566630,1
2,1140,1133,Rembrandt,2,1140,1140,"Zlatan Ibrahimovic, the son of Balkan immigran...",1,1140,1140,133546465196330277259842397171484818790,1
3,1589,1578,Ocean's Eleven,2,1589,1589,"Aware that her time is running short, irascibl...",1,1589,1589,133555542366909536525000489501874814310,1
4,1699,1645,Halloween,3,1699,1699,Returning to an eerily empty hotel after a day...,1,1699,1699,133568186389365187970637043180908936550,1
5,1606,1590,Always Be My Maybe,2,1606,1606,In order to save her relationship from falling...,1,1606,1606,133581667853498615190321960619498110310,1


# Preparing the data and creating the CatBoost model

In [5]:
# The free-text description is the only model input; `target` is the class label.
X_train, y_train = train_data["movie_description"], train_data["target"]
X_test = test_data["movie_description"]

# Small, seeded classifier: 100 iterations, tree depth 5.
model = CatBoostClassifier(iterations=100, depth=5, random_seed=42)

# X_train is a single column, so feature index 0 is the one flagged as text.
# verbose=True logs the training loss after every iteration.
model.fit(X_train, y_train, text_features=[0], verbose=True)


Learning rate set to 0.5
0:	learn: 0.6379966	total: 215ms	remaining: 21.3s
1:	learn: 0.4115786	total: 342ms	remaining: 16.7s
2:	learn: 0.3088364	total: 495ms	remaining: 16s
3:	learn: 0.2512520	total: 643ms	remaining: 15.4s
4:	learn: 0.2143676	total: 770ms	remaining: 14.6s
5:	learn: 0.1921620	total: 905ms	remaining: 14.2s
6:	learn: 0.1753263	total: 1.04s	remaining: 13.8s
7:	learn: 0.1564486	total: 1.18s	remaining: 13.5s
8:	learn: 0.1494784	total: 1.3s	remaining: 13.2s
9:	learn: 0.1489393	total: 1.43s	remaining: 12.9s
10:	learn: 0.1453907	total: 1.56s	remaining: 12.6s
11:	learn: 0.1422216	total: 1.68s	remaining: 12.3s
12:	learn: 0.1372412	total: 1.8s	remaining: 12.1s
13:	learn: 0.1366207	total: 1.92s	remaining: 11.8s
14:	learn: 0.1321891	total: 2.04s	remaining: 11.6s
15:	learn: 0.1309876	total: 2.16s	remaining: 11.3s
16:	learn: 0.1240254	total: 2.29s	remaining: 11.2s
17:	learn: 0.1222762	total: 2.4s	remaining: 10.9s
18:	learn: 0.1219894	total: 2.52s	remaining: 10.7s
19:	learn: 0.1091740	

<catboost.core.CatBoostClassifier at 0x7f8d344fecb0>

# Predict

In [6]:
# Preparing data in Pool format
# The test descriptions are wrapped in a Pool with the single column (index 0)
# flagged as text, matching the `text_features=[0]` used when fitting.
dataset_test = Pool(
    data=X_test,
    text_features=[0]
)

# Predicted class label for every test row.
predict_classes = model.predict(dataset_test)

# For a multiclass model CatBoost returns the labels as a 2-D column of shape
# (n_rows, 1). Flatten to a plain 1-D vector so it can be assigned to a
# DataFrame column, wrapped in a Series, or compared element-wise without
# relying on implicit squeezing. `ravel()` is a no-op if it is already 1-D.
predictions = predict_classes.ravel()


# Create submission

In [7]:
# Load the submission template and fill its `target` column with the predicted
# labels. The array is matched to rows by position, so this assumes the
# template rows are in the same order as test.csv — TODO confirm against `id`.
sample_submission = pd.read_csv(
    os.path.join(path, "sample_submission.csv")
).assign(target=predictions)

# Preview the first rows of the filled-in submission.
sample_submission.head()


Unnamed: 0,id,target
0,133529667241314002934985813983134580070,5
1,133529693386607632642217219852638190950,5
2,133529737754378640630246272237250379110,5
3,133529756769137644053687294687798459750,1
4,133529828866765532034234504812793265510,5


In [8]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the file.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)
