In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors, svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score

In [2]:
def binary_columns(data: pd.DataFrame, column_names: list):
    for column_name in column_names:
        unique_values = list(data[column_name].unique())

        for unique_value in unique_values:
            if unique_value != 'unknown':
                data['{}_{}'.format(column_name, unique_value)] = (data[column_name] == unique_value).astype(int)
        data = data.drop([column_name], axis = 1)
    return data

In [3]:
def encode(name):
    enc = OrdinalEncoder()
    return pd.Series(enc.fit_transform(df[name].fillna('').values.reshape(-1, 1))[:, -1].astype(int), name=name)

In [4]:
df = pd.read_csv('part.csv', dtype={'gamecategory':'category', 'subgamecategory':'category', 'oblast':'category', 'os':'category', 'osv':'category'})
df = df.dropna()
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,0,0,0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
2,2,2,2,5,Games,Arcade,com.orbitalknight.ridiculousfreekick,2021-08-04 13:34:29,MSK,Санкт-Петербург,Санкт-Петербург,android,9.0
5,5,5,5,5,Games,Puzzle,com.hwg.sos,2021-09-18 09:44:24,MSK,Татарстан,Альметьевск,android,7.1
6,6,6,6,4,Games,Arcade,com.hikergames.ArcadeHunter,2021-08-02 08:52:24,MSK,Москва,Москва,android,11.0
7,7,7,7,5,Games,Card,com.lemongame.klondike.solitaire,2021-09-18 02:36:11,MSK,Санкт-Петербург,Санкт-Петербург,android,8.1.0


In [5]:
chunk_list = []

# read the large csv file with specified chunksize 
with pd.read_csv('train.csv', chunksize=1000000) as reader:
    for chunk in reader:
        filtered = chunk.drop('city', axis=1)
        # perform data filtering 
        # Once the data filtering is done, append the chunk to list
        chunk_list.append(filtered)
    
# concat the list into dataframe 
df = pd.concat(chunk_list)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44854516 entries, 0 to 44854515
Data columns (total 9 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   Segment          int64 
 1   gamecategory     object
 2   subgamecategory  object
 3   bundle           object
 4   created          object
 5   shift            object
 6   oblast           object
 7   os               object
 8   osv              object
dtypes: int64(1), object(8)
memory usage: 3.0+ GB


In [7]:
#df = df.drop(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'city'], axis = 1)
# drop columns
#df = encode(df, ['gamecategory', 'subgamecategory', 'oblast', 'os', 'osv'])
# create one-hot vectors
X = pd.concat([encode(name) for name in ['gamecategory', 'subgamecategory', 'oblast', 'os', 'osv']], axis=1)

In [8]:
#bundle_splitted = df['bundle'].str.split(pat='.')

#df['app_owner'] = [bundle_part[1] if bundle_part is list and len(bundle_part) > 1 else '' for bundle_part in bundle_splitted]

KeyboardInterrupt: 

In [None]:
df = df.drop(['bundle'], axis=1)

In [9]:
date_elements = df['created'].str.split(pat='-')
df['month'] = [int(date_part[1]) if len(date_part) > 2 else 0 for date_part in date_elements]
df['day'] = [int(date_part[2][:2]) if len(date_part) > 2 else 0 for date_part in date_elements]
df

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,os,osv,month,day
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,android,10.0,7,5
1,4,,,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Android,10.0.0,7,10
2,5,Games,Arcade,com.orbitalknight.ridiculousfreekick,2021-08-04 13:34:29,MSK,Санкт-Петербург,android,9.0,8,4
3,5,,,tcouchgind.scooterextreme.scooter,2021-08-06 07:35:27,MSK+2,Свердловская область,android,9,8,6
4,4,,,com.FidgetTrading3D.game,2021-08-02 20:43:59,MSK,Московская область,android,6.0.1,8,2
...,...,...,...,...,...,...,...,...,...,...,...
44854511,3,Games,Simulation,1068204657,2021-08-07 17:19:23,MSK,Краснодарский край,ios,14.4.2,8,7
44854512,3,Games,Puzzle,com.easybrain.nonogram.color,2021-08-02 09:17:16,MSK,Владимирская область,android,11.0,8,2
44854513,5,Games,Arcade,com.nordcurrent.canteenhd,2021-09-16 09:26:38,MSK,Брянская область,android,5.1,9,16
44854514,4,,,com.fugo.wow,2021-07-09 18:02:33,MSK,Татарстан,Android,7.1.2,7,9


In [10]:
#df = df.drop(['created', 'shift'], axis=1)
month = df['month']
day = df['day']

In [11]:
X = X.reset_index(drop=True)
month = month.reset_index(drop=True)
day = day.reset_index(drop=True)

X = X.join(month, day)

In [13]:
X.to_csv('X.csv')

In [None]:
#df = binary_columns(df, ['app_owner'])
df = df.drop(['app_owner'], axis=1)

# create one-hot vectors
df

In [15]:
y = df['Segment']
#df = df.drop('Segment', axis=1)
#y = y.loc[200000:300000]
#y
y.to_csv('y.csv')

In [16]:
#X = df.loc[200000:300000]
#X
del df

In [None]:
step = 10000

indexes = np.arange(0, len(X.index)/step)

In [26]:
np.random.shuffle(indexes)

In [37]:
X

Unnamed: 0,gamecategory,subgamecategory,oblast,os,osv,month
0,8,40,21,2,6,9
1,0,0,56,1,8,8
2,8,3,65,2,237,8
3,0,0,69,2,235,8
4,0,0,49,2,196,8
...,...,...,...,...,...,...
44854511,8,43,36,4,121,9
44854512,8,39,14,2,28,8
44854513,8,3,11,2,183,9
44854514,0,0,74,1,215,8


In [27]:
indexes

array([127., 343.,  46., 107., 375., 318., 310., 380., 381., 217., 319.,
       143., 132.,  70., 443., 251., 267., 218., 184., 297., 361.,  66.,
       121.,  41., 428., 219., 237., 377., 369.,  38., 278., 289.,  34.,
        91., 430., 387., 437., 330., 114., 440.,  52., 313., 142., 342.,
       303., 329., 139., 328., 229., 259., 282., 231., 424., 261., 446.,
       164., 409., 396., 304., 131., 140., 226., 108., 365., 169., 122.,
       167., 410.,  14., 151., 262., 158., 225., 233., 374., 178., 111.,
       295., 224., 323., 298.,   1., 215., 411., 209.,  86., 407., 388.,
       192., 120., 284., 191.,  43., 141., 183.,   6.,  31., 332., 247.,
       135., 444.,  88.,  60., 376.,  64., 311.,  30., 241., 281., 294.,
       124., 244., 216., 223., 128., 368., 248., 389.,  22.,  79., 384.,
       307., 138., 176., 285., 288., 190., 416.,  58., 201., 399., 408.,
       373., 350.,   2., 106., 367., 398.,  48., 243., 397., 338., 386.,
       413.,  56., 292., 390., 185.,  71., 258., 16

In [29]:
from sklearn.decomposition import PCA

In [41]:
pca = PCA(n_components=2)

In [31]:
svc = svm.SVC(kernel='rbf', gamma=5)

In [32]:
import pickle

In [None]:
epoch = 1

for i in indexes:
    start = int(i * step)
    stop = int((i + 1) * step) if (i + 1) * step < len(X.index) else len(X.index) * step - 1
    X_s = X[start:stop].values
    y_s = y[start:stop].values
    
    print(f'Epoch {epoch}:')
    
    X_s = pca.fit_transform(X_s)
    
    print('Transformed.')
    
    X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, test_size = 0.3, random_state = 0)
    
    print('Splited.')
    
    svc.fit(X_train, y_train)
    
    print('Trained.')
    
    score = svc.score(X_test, y_test)
    
    #score = cross_val_score(model, X, y, cv= kf, scoring="accuracy")
    
    #print(f' Scores for each fold are: {score}')
    print(f'Average score: {"{:.2f}".format(score)}\n')
    
    with open(f'models/SVM_{epoch}.pkl','wb') as f:
        pickle.dump(model, f)

[3 3 3 ... 3 2 5] 12700000 12800000
Epoch 1:
Transformed.
Splited.
