# Importing Libraries

In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from googletrans import Translator
pd.set_option('display.max_columns', 1000000, 'display.max_rows', 1000000)

# File descriptions
* **sales_train.csv** - the training set. Daily historical data from January 2013 to October 2015.
* **test.csv** - the test set. You need to forecast the sales for these shops and products for November 2015.
* **sample_submission.csv** - a sample submission file in the correct format.
* **items.csv** - supplemental information about the items/products.
* **item_categories.csv**  - supplemental information about the items categories.
* **shops.csv**- supplemental information about the shops.

# Data fields
* **ID** - an Id that represents a (Shop, Item) tuple within the test set
* **shop_id** - unique identifier of a shop
* **item_id** - unique identifier of a product
* **item_category_id** - unique identifier of item category
* **item_cnt_day** - number of products sold. You are predicting a monthly amount of this measure
* **item_price** - current price of an item
* **date** - date in format dd/mm/yyyy
* **date_block_num** - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* **item_name** - name of item
* **shop_name** - name of shop
* **item_category_name** - name of item category

# Reading Data Set

In [2]:
sales_train = pd.read_csv("sales_train.csv")
items = pd.read_csv("items.csv")
item_categories = pd.read_csv("item_categories.csv") 
shops = pd.read_csv("shops.csv")

# Getting basic insight of data frames

In [3]:
print(" sales_train -> ", sales_train.shape, "\n",
     "items -> ", items.shape, "\n",
     "item_categories -> ", item_categories.shape, "\n",
     "shops -> ", shops.shape)

 sales_train ->  (2935849, 6) 
 items ->  (22170, 3) 
 item_categories ->  (84, 2) 
 shops ->  (60, 2)


### ----------------------------------------->>     sales_train  <<------------------------------------------------------###

In [4]:
sales_train.columns, len(sales_train.columns)

(Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
        'item_cnt_day'],
       dtype='object'), 6)

In [5]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [6]:
sales_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2935849 entries, 0 to 2935848
Data columns (total 6 columns):
date              object
date_block_num    int64
shop_id           int64
item_id           int64
item_price        float64
item_cnt_day      float64
dtypes: float64(2), int64(3), object(1)
memory usage: 134.4+ MB


In [7]:
sales_train.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641
std,9.422988,16.22697,6324.297,1729.8,2.618834
min,0.0,0.0,0.0,-1.0,-22.0
25%,7.0,22.0,4476.0,249.0,1.0
50%,14.0,31.0,9343.0,399.0,1.0
75%,23.0,47.0,15684.0,999.0,1.0
max,33.0,59.0,22169.0,307980.0,2169.0


### ----------------------------------------->>     items    <<------------------------------------------------------###

In [8]:
items.columns, len(items.columns)

(Index(['item_name', 'item_id', 'item_category_id'], dtype='object'), 3)

In [9]:
items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [10]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 3 columns):
item_name           22170 non-null object
item_id             22170 non-null int64
item_category_id    22170 non-null int64
dtypes: int64(2), object(1)
memory usage: 519.7+ KB


In [11]:
items.describe()

Unnamed: 0,item_id,item_category_id
count,22170.0,22170.0
mean,11084.5,46.290753
std,6400.07207,15.941486
min,0.0,0.0
25%,5542.25,37.0
50%,11084.5,40.0
75%,16626.75,58.0
max,22169.0,83.0


### ----------------------------------------->>     item_categories    <<------------------------------------------------------###

In [12]:
item_categories.columns, len(item_categories.columns)

(Index(['item_category_name', 'item_category_id'], dtype='object'), 2)

In [13]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [14]:
item_categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 2 columns):
item_category_name    84 non-null object
item_category_id      84 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [15]:
item_categories.describe()

Unnamed: 0,item_category_id
count,84.0
mean,41.5
std,24.392622
min,0.0
25%,20.75
50%,41.5
75%,62.25
max,83.0


### ----------------------------------------->>     shops    <<------------------------------------------------------###

In [16]:
shops.columns, len(shops.columns)

(Index(['shop_name', 'shop_id'], dtype='object'), 2)

In [17]:
shops.head()

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [18]:
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
shop_name    60 non-null object
shop_id      60 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.0+ KB


In [19]:
shops.describe()

Unnamed: 0,shop_id
count,60.0
mean,29.5
std,17.464249
min,0.0
25%,14.75
50%,29.5
75%,44.25
max,59.0


### Concatinating dataframes

In [20]:
sales_train_CC_items = sales_train.merge(items, 
                                         on="item_id")
sales_train_CC_items_CC_item_categories = sales_train_CC_items.merge(item_categories, 
                                                                     on="item_category_id")
sales_train_CC_items_CC_item_categories_CC_shops = sales_train_CC_items_CC_item_categories.merge(shops, 
                                                                                                 on="shop_id")
print(sales_train_CC_items_CC_item_categories_CC_shops.shape)
print(sales_train_CC_items_CC_item_categories_CC_shops.columns, 
      len(sales_train_CC_items_CC_item_categories_CC_shops.columns))

(2935849, 10)
Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_name', 'item_category_id', 'item_category_name',
       'shop_name'],
      dtype='object') 10


In [21]:
sales_train_CC_items_CC_item_categories_CC_shops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2935849 entries, 0 to 2935848
Data columns (total 10 columns):
date                  object
date_block_num        int64
shop_id               int64
item_id               int64
item_price            float64
item_cnt_day          float64
item_name             object
item_category_id      int64
item_category_name    object
shop_name             object
dtypes: float64(2), int64(4), object(4)
memory usage: 246.4+ MB


In [22]:
sales_train_CC_items_CC_item_categories_CC_shops.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641,40.00138
std,9.422988,16.22697,6324.297,1729.8,2.618834,17.10076
min,0.0,0.0,0.0,-1.0,-22.0,0.0
25%,7.0,22.0,4476.0,249.0,1.0,28.0
50%,14.0,31.0,9343.0,399.0,1.0,40.0
75%,23.0,47.0,15684.0,999.0,1.0,55.0
max,33.0,59.0,22169.0,307980.0,2169.0,83.0


### Creating copy of final merged data set

In [23]:
finalDF = sales_train_CC_items_CC_item_categories_CC_shops.copy()
finalDF.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,item_category_name,shop_name
0,02.01.2013,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
1,26.04.2013,3,59,944,150.0,1.0,2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
2,26.06.2013,5,59,944,199.5,1.0,2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
3,20.07.2013,6,59,944,199.5,1.0,2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""
4,14.09.2013,8,59,944,299.0,2.0,2012 (BD),37,Кино - Blu-Ray,"Ярославль ТЦ ""Альтаир"""


In [66]:
finalDF.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_category_id
count,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0,2935849.0
mean,14.56991,33.00173,10197.23,890.8532,1.242641,40.00138
std,9.422988,16.22697,6324.297,1729.8,2.618834,17.10076
min,0.0,0.0,0.0,-1.0,-22.0,0.0
25%,7.0,22.0,4476.0,249.0,1.0,28.0
50%,14.0,31.0,9343.0,399.0,1.0,40.0
75%,23.0,47.0,15684.0,999.0,1.0,55.0
max,33.0,59.0,22169.0,307980.0,2169.0,83.0


### Removing missing values i.e -1

In [80]:
finalDF[(finalDF.date_block_num == -1)]

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,day,month,year,en_item_category_name,en_shop_name


In [81]:
finalDF = finalDF[~(finalDF.item_cnt_day == -1)]
finalDF = finalDF[~(finalDF.item_price == -1)]

# Feature Extraction

Splitting date based on dots

In [24]:
dateSplit = finalDF.date.str.split(".", expand=True)
dateSplit.shape

(2935849, 3)

In [25]:
dateSplit.columns = ["day", "month", "year"]
dateSplit.head()

Unnamed: 0,day,month,year
0,2,1,2013
1,26,4,2013
2,26,6,2013
3,20,7,2013
4,14,9,2013


In [26]:
finalDF = pd.concat([finalDF, dateSplit], axis = 1).drop('date', axis=1)

In [27]:
finalDF.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_price', 'item_cnt_day',
       'item_name', 'item_category_id', 'item_category_name', 'shop_name',
       'day', 'month', 'year'],
      dtype='object')

### translating Russian text to English text

Converting **item_category_name**

In [28]:
unique_item_category_name = pd.unique(finalDF.item_category_name)
unique_item_category_name = list(unique_item_category_name)
unique_item_category_name

['Кино - Blu-Ray',
 'Музыка - Винил',
 'Музыка - CD фирменного производства',
 'Музыка - Музыкальное видео',
 'Музыка - CD локального производства',
 'Игры - XBOX 360',
 'Игры - PS3',
 'Игры PC - Дополнительные издания',
 'Игры PC - Стандартные издания',
 'Игры - PSP',
 'Кино - DVD',
 'Программы - Для дома и офиса',
 'Книги - Методические материалы 1С',
 'Игры PC - Коллекционные издания',
 'Игры - PSVita',
 'Подарки - Развитие',
 'Программы - 1С:Предприятие 8',
 'Программы - Обучающие',
 'Музыка - MP3',
 'Музыка - Подарочные издания',
 'Аксессуары - PSP',
 'Подарки - Гаджеты, роботы, спорт',
 'Книги - Аудиокниги',
 'Игровые консоли - XBOX 360',
 'Аксессуары - PS3',
 'Аксессуары - PS4',
 'Аксессуары - PSVita',
 'Карты оплаты - PSN',
 'Карты оплаты - Live!',
 'Аксессуары - XBOX 360',
 'Кино - Blu-Ray 3D',
 'Игры - Аксессуары для игр',
 'Игровые консоли - PSVita',
 'Книги - Аудиокниги 1С',
 'Кино - Коллекционное',
 'Подарки - Открытки, наклейки',
 'Игровые консоли - PS3',
 'Подарки - Суве

In [29]:
[re.split('[,]', i) for i in unique_item_category_name]

[['Кино - Blu-Ray'],
 ['Музыка - Винил'],
 ['Музыка - CD фирменного производства'],
 ['Музыка - Музыкальное видео'],
 ['Музыка - CD локального производства'],
 ['Игры - XBOX 360'],
 ['Игры - PS3'],
 ['Игры PC - Дополнительные издания'],
 ['Игры PC - Стандартные издания'],
 ['Игры - PSP'],
 ['Кино - DVD'],
 ['Программы - Для дома и офиса'],
 ['Книги - Методические материалы 1С'],
 ['Игры PC - Коллекционные издания'],
 ['Игры - PSVita'],
 ['Подарки - Развитие'],
 ['Программы - 1С:Предприятие 8'],
 ['Программы - Обучающие'],
 ['Музыка - MP3'],
 ['Музыка - Подарочные издания'],
 ['Аксессуары - PSP'],
 ['Подарки - Гаджеты', ' роботы', ' спорт'],
 ['Книги - Аудиокниги'],
 ['Игровые консоли - XBOX 360'],
 ['Аксессуары - PS3'],
 ['Аксессуары - PS4'],
 ['Аксессуары - PSVita'],
 ['Карты оплаты - PSN'],
 ['Карты оплаты - Live!'],
 ['Аксессуары - XBOX 360'],
 ['Кино - Blu-Ray 3D'],
 ['Игры - Аксессуары для игр'],
 ['Игровые консоли - PSVita'],
 ['Книги - Аудиокниги 1С'],
 ['Кино - Коллекционное'],

In [30]:
translator = Translator()
translatedList = translator.translate(unique_item_category_name, dest='en')

In [31]:
translatedEnglishList = []
for i in translatedList:
    translatedEnglishList.append((i.origin, i.text))
translatedEnglishListdf = pd.DataFrame({"item_category_name": [translatedEnglishList[k][0] for k in range(len(translatedEnglishList))],
                                       "en_item_category_name": [translatedEnglishList[k][1] for k in range(len(translatedEnglishList))]
                                       })

In [32]:
translatedEnglishListdf[translatedEnglishListdf['item_category_name'] == translatedEnglishListdf['en_item_category_name']]

Unnamed: 0,item_category_name,en_item_category_name
73,Книги - Путеводители,Книги - Путеводители


In [33]:
translatedEnglishListdf.iloc[73]['en_item_category_name'] = "Books - Travel Guides"

In [34]:
finalDF = finalDF.merge(translatedEnglishListdf, on="item_category_name")

In [35]:
finalDF.drop('item_category_name', axis=1, inplace=True)

Converting **shop_name**

In [36]:
unique_shop_name = pd.unique(finalDF.shop_name)
unique_shop_name = list(unique_shop_name)
unique_shop_name

['Ярославль ТЦ "Альтаир"',
 'Москва ТК "Буденовский" (пав.К7)',
 'Москва ТЦ "МЕГА Белая Дача II"',
 'Москва ТРК "Атриум"',
 'Воронеж (Плехановская, 13)',
 'Калуга ТРЦ "XXI век"',
 'Воронеж ТРЦ "Максимир"',
 'Москва ТЦ "Семеновский"',
 'Химки ТЦ "Мега"',
 'СПб ТК "Невский Центр"',
 'Омск ТЦ "Мега"',
 'Новосибирск ТЦ "Мега"',
 'Сергиев Посад ТЦ "7Я"',
 'Самара ТЦ "Мелодия"',
 'Тюмень ТЦ "Зеленый Берег"',
 'Коломна ТЦ "Рио"',
 '!Якутск ТЦ "Центральный" фран',
 '!Якутск Орджоникидзе, 56 фран',
 'Москва ТЦ "МЕГА Теплый Стан" II',
 'Якутск Орджоникидзе, 56',
 'Москва ТК "Буденовский" (пав.А2)',
 'Якутск ТЦ "Центральный"',
 'Чехов ТРЦ "Карнавал"',
 'Н.Новгород ТРЦ "Фантастика"',
 'Сургут ТРЦ "Сити Молл"',
 'Москва МТРЦ "Афи Молл"',
 'Москва Магазин С21',
 'Курск ТЦ "Пушкинский"',
 'Красноярск ТЦ "Июнь"',
 'Воронеж ТРЦ Сити-Парк "Град"',
 'Москва ТЦ "Перловский"',
 'РостовНаДону ТЦ "Мега"',
 'Самара ТЦ "ПаркХаус"',
 'Москва ТЦ "Серебряный Дом"',
 'Москва ТЦ "Новый век" (Новокосино)',
 'Тюмень 

In [37]:
translatedList_shop_name = translator.translate(unique_shop_name, dest='en')

In [38]:
translatedEnglishList_shop_name = []
for i in translatedList_shop_name:
    translatedEnglishList_shop_name.append((i.origin, i.text))
translatedEnglishList_shop_namedf = pd.DataFrame({"shop_name": [translatedEnglishList_shop_name[k][0] for k in range(len(translatedEnglishList_shop_name))],
                                       "en_shop_name": [translatedEnglishList_shop_name[k][1] for k in range(len(translatedEnglishList_shop_name))]
                                       })

In [39]:
translatedEnglishList_shop_namedf[translatedEnglishList_shop_namedf['shop_name'] == \
                                  translatedEnglishList_shop_namedf['en_shop_name']]

Unnamed: 0,shop_name,en_shop_name


In [40]:
finalDF = finalDF.merge(translatedEnglishList_shop_namedf, on="shop_name")
finalDF.drop('shop_name', axis=1, inplace=True)

In [41]:
finalDF.head(10)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,day,month,year,en_item_category_name,en_shop_name
0,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,2,1,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
1,3,59,944,150.0,1.0,2012 (BD),37,26,4,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
2,5,59,944,199.5,1.0,2012 (BD),37,26,6,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
3,6,59,944,199.5,1.0,2012 (BD),37,20,7,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
4,8,59,944,299.0,2.0,2012 (BD),37,14,9,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
5,9,59,944,299.0,1.0,2012 (BD),37,21,10,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
6,10,59,944,299.0,1.0,2012 (BD),37,2,11,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
7,12,59,944,299.0,1.0,2012 (BD),37,6,1,2014,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
8,14,59,944,299.0,1.0,2012 (BD),37,30,3,2014,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""
9,15,59,944,199.0,1.0,2012 (BD),37,20,4,2014,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair"""


Converting **item_name**

In [42]:
unique_item_name = pd.unique(finalDF.item_name)
unique_item_name = list(unique_item_name)
unique_item_name

['ЯВЛЕНИЕ 2012 (BD)',
 '2012 (BD)',
 '28 ДНЕЙ СПУСТЯ (BD)',
 'ALL INCLUSIVE, ИЛИ ВСЕ ВКЛЮЧЕНО (BD)',
 '9 (ДЕВЯТЬ) (BD)',
 'ЧЕЛОВЕК ДОЖДЯ (BD)',
 'ЧУЖОЙ ПРОТИВ ХИЩНИКА (BD)',
 'ШЕРЛОК. СЕЗОН 2 (BD)',
 'ФОРСАЖ 5 (BD+DVD)',
 'ФРАНЦУЗСКИЙ СВЯЗНОЙ (BD)',
 'ХИЖИНА В ЛЕСУ (BD)',
 'ХРАБРОЕ СЕРДЦЕ (BD)',
 'ХРОНИКА (BD)',
 'ХРОНИКИ НАРНИИ. ПОКОРИТЕЛЬ ЗАРИ (BD+DVD)',
 'ЦАРСТВО НЕБЕСНОЕ (BD)',
 'ЭВОЛЮЦИЯ БОРНА (BD)',
 'ЭДВАРД РУКИ-НОЖНИЦЫ (BD)',
 '1+1 (BD)',
 'СОЛОВЕЙ-РАЗБОЙНИК (BD)',
 'СУДЬЯ ДРЕДД 3D (BD)',
 'СХВАТКА (BD)',
 'ТАЧКИ 2 (BD)',
 'ТЕМНЫЙ РЫЦАРЬ WB (BD)',
 'СЕКС ПО ДРУЖБЕ (BD)',
 'ТОР (BD)',
 'ТРАНСФОРМЕРЫ (BD)',
 'ТРАНСФОРМЕРЫ 3. ТЁМНАЯ СТОРОНА ЛУНЫ (BD)',
 'ТРАНСФОРМЕРЫ: МЕСТЬ ПАДШИХ (BD)',
 'ПОЛНОЧЬ В ПАРИЖЕ (BD)',
 'ПРИКЛЮЧЕНИЯ БУРАТИНО  э (BD)',
 'ПЕТЛЯ ВРЕМЕНИ (BD)',
 'ОГРАБЛЕНИЕ КАЗИНО (BD)',
 'ОСОБО ОПАСНЫ (BD)',
 'ПРОМЕТЕЙ (BD)',
 'С НОВЫМ ГОДОМ, МАМЫ! (BD)',
 'ЧЕЛЮСТИ 3D (BD)',
 'ФЕИ: ТАЙНА ЗИМНЕГО ЛЕСА (BD)',
 'ХОЗЯИН МОРЕЙ. НА КРАЮ ЗЕМЛИ (BD)',
 'ОБЛАЧНЫЙ АТЛАС (BD)',
 'ДЕН

In [43]:
len(unique_item_name)

21807

In [44]:
unique_item_name_df = pd.DataFrame({'item_name': unique_item_name})

In [84]:
finalDF.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,day,month,year,en_item_category_name,en_shop_name,item_category
0,0,59,22154,999.0,1.0,ЯВЛЕНИЕ 2012 (BD),37,2,1,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair""",0
1,3,59,944,150.0,1.0,2012 (BD),37,26,4,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair""",0
2,5,59,944,199.5,1.0,2012 (BD),37,26,6,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair""",0
3,6,59,944,199.5,1.0,2012 (BD),37,20,7,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair""",0
4,8,59,944,299.0,2.0,2012 (BD),37,14,9,2013,Cinema - Blu-ray,"Yaroslavl shopping center ""Altair""",0


In [204]:
unique_item_name_df['item_category'] = 0
item_type_check = {'pc': 1,
                   'PC': 1,
                   ' MAC': 2,
                   '[[(]MAC': 2,
                   'Mac': 2,
                   'xbox': 3,
                   'XBOX': 3,
                   'X-Box': 3,
                   'X360': 3,
                   'ps[0-9]': 4,
                   'Ps[0-9]': 4,
                   'PS[0-9]': 4,
                   'Android': 5,
                   'android': 5,
                   'ANDROID': 5,
                   'PSP': 6,
                   'psp': 6,
                   'Psp': 6,
                   'BD': 7,
                   'КНИГА': 8,
                   'mp3': 9,
                   'N1NTENDО': 10,
                   'CD\+\DVD': 11,
                   'windows': 12,
                   'Windows': 12,
                   'WINDOWS': 12,
                   'linux': 13,
                   'Linux': 13,
                   'LINUX': 13
                   
}

In [211]:
item_type_check_correction = {'Machine': 0,
                              '[0-9]CD': 11
                             }

In [205]:
for platform, category in item_type_check.items():
    unique_item_name_df.item_category[unique_item_name_df.item_name.str.contains(platform, 
                                                                                 regex=True)] = category

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [209]:
for platform, category in item_type_check_correction.items():
    unique_item_name_df.item_category[unique_item_name_df.item_name.str.contains(platform,
                                                                                 case = False,
                                                                                 regex=True)] = category

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [212]:
unique_item_name_df[unique_item_name_df.item_name.str.contains('mac', case=False, regex=True)].head(1000)

Unnamed: 0,item_name,item_category
1003,DEPECHE MODE Delta Machine Deluxe Edition 2...,0
1110,FLORENCE AND THE MACHINE Ceremonials,0
1623,DEPECHE MODE Delta Machine,0
3066,Spore (рус.в.) (PC&Mac) (PC-DVD) (Jewel),2
5697,Йо-йо AERO MACHETTE,2
5825,PS3: Turtle Beaсh. EarForce P11. Геймерская ст...,4
5841,PS3: Turtle Beaсh. EarForce PLa. Геймерская ст...,4
5928,X360: Turtle Beaсh. EarForce X12. Геймерская с...,3
6748,Мягкая игрушка Adventure Time Jake Slamacow со...,0
6749,Мягкая игрушка Adventure Time Finn Slamacow со...,0


In [200]:
unique_item_name_df[unique_item_name_df.item_category == 0].shape

(14132, 2)

(14132, 2)  (13389, 2)

In [156]:
unique_item_name_df[unique_item_name_df.item_category == 0].head(1000)

Unnamed: 0,item_name,item_category
134,БЕЗ ЛИЦА/ВОЗДУШНАЯ ТЮРЬМА/УГНАТЬ ЗА 60 СЕКУНД ...,0
957,OST Pulp Fiction LP,0
958,KRALL DIANA Very Best Of 2LP,0
959,SANTANA Santana LP,0
960,METALLICA ...And Justice For All 2LP,0
961,НИКОЛЬСКИЙ КОНСТАНТИН Иллюзии LP,0
962,METALLICA Master Of Puppets LP,0
963,ADELE 21 LP,0
964,CLAPTON ERIC Old Sock 2LP,0
965,HOOKER JOHN LEE Anthology 2LP,0


In [115]:
unique_item_name_df.iloc[2909:2911]

Unnamed: 0,item_name,item_category
2909,Battlefield 4 Premium. Сборник дополнений (код...,0
2910,"Call of Duty. Ghosts + Black Ops II [PC, Jewel...",1


In [None]:
test = []
global k
k = -1
def iterOver_unique_item_name(k):
    try:
        for i in range(k, len(unique_item_name)):
            k = k + 1
            print(k)
            test.append(translator.translate(unique_item_name[i]).text)
            
    except:
        test.append(unique_item_name[k-1])
        k = k + 1
        print(k)
        iterOver_unique_item_name(k)
        pass
        
iterOver_unique_item_name(k)

In [None]:
unique_item_name[79]

In [None]:
test

In [None]:
translatedEnglishList_item_name = []
for i in translatedList_item_name:
    translatedEnglishList_item_name.append((i.origin, i.text))
translatedEnglishList_item_namedf = pd.DataFrame({"shop_name": [translatedEnglishList_item_name[k][0] for k in range(len(translatedEnglishList_item_name))],
                                       "en_shop_name": [translatedEnglishList_item_name[k][1] for k in range(len(translatedEnglishList_item_name))]
                                       })

In [None]:
translatedEnglishList_item_namedf[translatedEnglishList_item_namedf['shop_name'] == \
                                  translatedEnglishList_item_namedf['en_shop_name']]