### Prepare and clean the data

In [1]:
import pandas as pd

In [2]:
%%time

# Read the training parquet file (expected in the working directory) into `train`.
train = pd.read_parquet(path='data_fusion_train.parquet')

Wall time: 17.7 s


In [3]:
%%time

# Keep only rows whose category_id is not -1 (presumably the "unlabeled"
# marker -- confirm against the dataset description) and give every remaining
# row a unit weight. `.assign` builds a new frame instead of writing a column
# into the filtered slice, which avoids pandas' SettingWithCopyWarning.
train = train.loc[train['category_id'] != -1].assign(weight=1)

# Collapse to one row per distinct item_name: keep the first category_id seen
# for that name and sum the unit weights, so `weight` ends up as the number of
# rows that shared the name.
train_unique = (
    train
    .groupby('item_name')
    .agg({'category_id': 'first', 'weight': 'sum'})
    .reset_index()
)

Wall time: 3.77 s


In [11]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

# NLTK's Russian stop-word list, removed from the vocabulary by the vectorizer.
stop = stopwords.words('russian')

# TF-IDF over unigrams and bigrams, capped at 10000 features.
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    stop_words=stop,
)

# Fit the vocabulary on the de-duplicated item names and vectorize them;
# the matching category ids are the targets.
X_train = tfidf.fit_transform(train_unique['item_name'])
y_train = train_unique.category_id

Wall time: 981 ms


In [12]:
# Share of the total weight carried by each category, largest first.
class_weights = (
    train_unique
    .groupby('category_id')
    .agg({'weight': 'sum'})
    .pipe(lambda totals: totals / totals.sum())
    .sort_values(by='weight', ascending=False)
)
class_weights

Unnamed: 0_level_0,weight
category_id,Unnamed: 1_level_1
203,0.205393
84,0.138665
80,0.102909
78,0.077039
71,0.065114
...,...
100,0.000048
1,0.000035
106,0.000028
97,0.000017


In [13]:
# Build the {category_id: weight share} lookup directly from `train_unique`
# rather than from the current value of `class_weights`. That keeps the cell
# idempotent: it can be re-run after `class_weights` has already become a dict.
class_totals = train_unique.groupby('category_id')['weight'].sum()
class_weights = (class_totals / class_totals.sum()).to_dict()

In [14]:
from sklearn.metrics import f1_score, make_scorer
import numpy as np

def f1_weighted(y, p):
    """Return the class-weighted sum of one-vs-rest F1 scores.

    For every distinct label ``c`` in ``y`` a binary F1 is computed for
    ``y == c`` against ``p == c`` and multiplied by ``class_weights[c]``
    (the notebook-level ``{category_id: weight}`` lookup); the weighted
    scores are then summed.

    Parameters
    ----------
    y : array-like
        True labels. A pandas Series, NumPy array or list is accepted.
    p : array-like
        Predicted labels, aligned element-wise with ``y``.

    Returns
    -------
    float
        Sum of ``F1(c) * class_weights[c]`` over the labels present in ``y``.
        Labels that occur only in ``p`` contribute nothing.

    Raises
    ------
    KeyError
        If ``y`` contains a label that is missing from ``class_weights``.
    """
    # Coerce both inputs to arrays so the element-wise comparisons below work
    # for Series, ndarray and list alike (a scorer may be handed any of them).
    y_true = np.asarray(y)
    y_pred = np.asarray(p)

    weighted_scores = [
        f1_score(y_true == label, y_pred == label) * class_weights[label]
        for label in np.unique(y_true)
    ]
    return np.sum(weighted_scores)


# Scorer wrapper so the metric can be passed as `scoring=` to scikit-learn.
f1_weighted_sc = make_scorer(f1_weighted)

### Comparing different models

In [15]:
def validate(model, cv=8):
    """Cross-validate ``model`` on the training data and print a summary.

    Runs ``cross_val_score`` on the notebook-level ``X_train`` / ``y_train``
    with the ``f1_weighted_sc`` scorer and ``n_jobs=-1``, then prints the mean
    and the standard deviation of the per-fold scores.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator to evaluate.
    cv : int or cross-validation splitter, default 8
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    # Imported locally so the function does not depend on a later cell having
    # been executed before it is called.
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=f1_weighted_sc, n_jobs=-1)
    print(np.mean(scores), ' ', np.std(scores))

In [17]:
%%time

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

Wall time: 0 ns


In [18]:

# Linear SVM with default hyper-parameters.
validate(model=LinearSVC())

0.7057776519722815   0.06387441873327576


In [27]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyper-parameters.
validate(model=LogisticRegression())

0.5679528562547214   0.029861366420197773


In [19]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with k = 3.
validate(model=KNeighborsClassifier(n_neighbors=3))

0.5050306056047401   0.05505787541860906


In [20]:
from sklearn.svm import SVC
# NOTE(review): the output saved with this cell is
# "NameError: name 'SVC' is not defined". That cannot come from this code,
# because SVC is imported on the line above -- the saved output appears to be
# stale. Re-run the cell to record an actual score before comparing models.
validate(SVC())

NameError: name 'SVC' is not defined

In [10]:
%%time

# Fit the classifier on the full training matrix.
# `random_state` is pinned because LinearSVC's dual coordinate-descent solver
# shuffles the data with a pseudo-random generator; a fixed seed makes the
# fitted model reproducible from run to run.
# `fit` returns the estimator itself, so the result is bound directly and no
# throwaway `_` is needed (rebinding `_` would shadow IPython's last-output
# variable).
clf = LinearSVC(random_state=0).fit(X_train, y_train)

CPU times: user 3.1 s, sys: 27.1 ms, total: 3.13 s
Wall time: 3.13 s


### Creating the submission for the website

In [11]:
import pickle

# Serialize the fitted vectorizer and classifier into t1_sub/.
# The context managers guarantee each file is flushed and closed as soon as
# the dump finishes, so nothing reads a partially written pickle afterwards.
with open('t1_sub/tfidf', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

with open('t1_sub/clf_task1', 'wb') as clf_file:
    pickle.dump(clf, clf_file)

In [12]:
# List the contents of t1_sub/ with human-readable sizes.
!ls -lh t1_sub

итого 12M
-rw-r--r-- 1 dmitry.dremov dmitry.dremov  312 янв 30 21:37 answers.csv
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 7,4M янв 30 21:38 clf_task1
drwxr-xr-x 2 dmitry.dremov dmitry.dremov 4,0K янв 30 18:06 data
-rw-r--r-- 1 dmitry.dremov dmitry.dremov  352 янв 30 18:11 script.py
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 2,9M янв 30 21:38 submission.zip
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 1,3M янв 30 21:38 tfidf


In [13]:
import zipfile 
compression = zipfile.ZIP_DEFLATED

submission_name = 't1_sub/submission.zip'
with zipfile.ZipFile(submission_name, 'w') as zipObj:
    for filename in [
        'clf_task1',
        'tfidf',
        'script.py',
    ]:
        zipObj.write(
            f't1_sub/{filename}', 
            arcname=filename, 
            compress_type=compression
        )
    print(zipObj.namelist())

!ls -lh {submission_name}

['clf_task1', 'tfidf', 'script.py']
-rw-r--r-- 1 dmitry.dremov dmitry.dremov 2,9M янв 30 21:38 t1_sub/submission.zip


### Debug run

In [14]:
!cd t1_sub; python3 script.py

In [15]:
# Preview the first five rows of t1_sub/answers.csv.
pd.read_csv('t1_sub/answers.csv').head()

Unnamed: 0,id,pred
0,0,80
1,1,204
2,2,204
3,3,71
4,4,79
