### Import required modules

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import pickle
from nltk.corpus import stopwords

### Read training and testing data from local files

In [2]:
# Load the train and test CSVs (in that order) into their own DataFrames.
train_data_df, test_data_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/Train.csv', 'dataset/Test.csv')
)

In [3]:
# Preview the first 10 rows of the training frame.
train_data_df.head(10)

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,Artworking/Typesetting Production Jun 2009 Cha...,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,Auto Leasing Corporate Services Corning Inc /N...,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,Store Management Lease/Rent Deltona Corp Real ...,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,Store Construction General Requirements Coloni...,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,Jul 2015 Aydin Corp Contingent Labor/Temp Labo...,CLASS-1376
5,15007,VENDOR-2076,GL-6101400,32.28,Final Site Clean Up 2018Mar Store Construction...,CLASS-1522
6,15009,VENDOR-1802,GL-6050310,5.38,Travel and Entertainment Miscellaneous Company...,CLASS-1758
7,15010,VENDOR-1191,GL-6101400,31.21,General Contractor General Requirements Final ...,CLASS-1522
8,15011,VENDOR-2120,GL-6100410,42.89,Aquila Distributors Inc ...,CLASS-1963
9,15012,VENDOR-1704,GL-6050100,59.5,Base Rent Store Management Chicago Rivet & Mac...,CLASS-1274


In [4]:
# Preview the first 10 rows of the test frame.
test_data_df.head(10)

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description
0,15003,VENDOR-2513,GL-6050310,56.13,Travel and Entertainment Miscellaneous Company...
1,15008,VENDOR-1044,GL-6101400,96.56,Final Site Clean Up Store Construction Advance...
2,15013,VENDOR-1254,GL-6101400,55.93,Arabian American Development Co Final Site Cle...
3,15019,VENDOR-1331,GL-2182000,32.62,Corporate Services Contingent Labor/Temp Labor...
4,15020,VENDOR-2513,GL-6050310,25.81,Fortune National Corp Miscellaneous Company Ca...
5,15022,VENDOR-2513,GL-6050310,22.71,Dec2007 Fortune National Corp Miscellaneous Co...
6,15024,VENDOR-1883,GL-2182000,47.38,Auto Leasing Corporate Services Corning Inc /N...
7,15026,VENDOR-2543,GL-6020600,26.08,Taxes Taxes Mar 2014 Frischs Restaurants Inc N...
8,15027,VENDOR-1944,GL-2182000,42.76,Daly John J Auto Fleet Repair and Maintenance ...
9,15028,VENDOR-2032,GL-6100500,70.47,SMAP Media Buy - Traditional Cgg Holding (U.S....


### Define a function to clean the data, i.e. replace non-letter characters (punctuation, digits) with spaces, lowercase the text and remove stopwords

In [5]:
# Cache for stopword collections so the NLTK corpus is read once, not once per word.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, building it only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_description(description, stop_words=None):
    """Normalise a free-text description into a space-separated string of words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lowercased and split on whitespace, and words found in
    ``stop_words`` are dropped.

    Parameters
    ----------
    description : str
        Raw description text. Any non-string value (e.g. ``None`` or a NaN
        float from a missing cell) is treated as an empty description.
    stop_words : container of str, optional
        Lowercase words to remove. Defaults to NLTK's English stopword list,
        which is loaded on first use and then reused.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``''`` if nothing is left.
    """
    # A missing value would otherwise make re.sub raise TypeError.
    if not isinstance(description, str):
        return ''
    if stop_words is None:
        stop_words = _english_stopwords()
    words_array = re.sub("[^a-zA-Z]", " ", description).lower().split()
    return ' '.join(word for word in words_array if word not in stop_words)

### Apply the function defined above to the 'Item_Description' column, as it will be used to train the classifier

In [6]:
# Overwrite the raw descriptions with their cleaned form in both frames.
for frame in (train_data_df, test_data_df):
    frame['Item_Description'] = frame['Item_Description'].apply(clean_description)

In [7]:
# Preview the first rows of the training frame after cleaning 'Item_Description'.
train_data_df.head()

Unnamed: 0,Inv_Id,Vendor_Code,GL_Code,Inv_Amt,Item_Description,Product_Category
0,15001,VENDOR-1676,GL-6100410,83.24,artworking typesetting production jun champion...,CLASS-1963
1,15002,VENDOR-1883,GL-2182000,51.18,auto leasing corporate services corning inc ny...,CLASS-1250
2,15004,VENDOR-1999,GL-6050100,79.02,store management lease rent deltona corp real ...,CLASS-1274
3,15005,VENDOR-1771,GL-6101400,48.5,store construction general requirements coloni...,CLASS-1522
4,15006,VENDOR-1331,GL-2182000,63.35,jul aydin corp contingent labor temp labor con...,CLASS-1376


### The next few code blocks analyze the frequency of words in the 'Item_Description' column
### This will help us decide the number of features used to train our classifier

In [8]:
# tokens: one list of whitespace-separated words per training description.
# words:  every word from every description, flattened in row order.
tokens = []
words = []
for description in train_data_df['Item_Description']:
    description_tokens = description.split()
    tokens.append(description_tokens)
    words.extend(description_tokens)

# NOTE: despite its name this is a plain list copy of `words` (duplicates are
# kept), which is what the frequency count below needs.
words_set = list(words)

In [9]:
# How many of the most frequent words to list.
TOP_N_WORDS = 200

# Count occurrences of every word across all training descriptions.
freq = nltk.FreqDist(words_set)

# Print "<rank>. <word> - <count>", most frequent first; enumerate supplies the
# 1-based rank instead of a hand-maintained counter.
for rank, (word, count) in enumerate(freq.most_common(TOP_N_WORDS), start=1):
    print(str(rank) + ". " + word + " - " + str(count))

1. miscellaneous - 3036
2. company - 3013
3. car - 2996
4. field - 2996
5. inc - 2132
6. services - 2011
7. rent - 1975
8. store - 1872
9. general - 1753
10. corporate - 1728
11. transportation - 1619
12. entertainment - 1499
13. travel - 1498
14. ground - 1498
15. corp - 1468
16. labor - 1460
17. auto - 1257
18. co - 1102
19. leasing - 1078
20. real - 1055
21. estate - 1055
22. management - 1023
23. lease - 987
24. base - 987
25. construction - 904
26. smap - 899
27. requirements - 851
28. contractor - 851
29. final - 851
30. site - 851
31. clean - 851
32. maintenance - 803
33. contingent - 730
34. temp - 730
35. production - 633
36. nov - 499
37. resources - 492
38. dec - 481
39. aug - 481
40. mar - 465
41. jul - 458
42. oct - 458
43. feb - 456
44. jun - 455
45. apr - 455
46. jan - 453
47. sep - 453
48. may - 452
49. national - 430
50. media - 405
51. human - 401
52. industries - 380
53. ny - 355
54. corning - 339
55. buy - 335
56. ltd - 318
57. agency - 308
58. bd - 306
59. trust - 

### Save the required training and testing data as separate pickle files

In [10]:
# Build a list of (cleaned description, product category) tuples, one per
# training row. The columns are paired by position via `.values`, so this does
# not rely on the DataFrame index being exactly 0..n-1 (the previous
# `series[i]` lookups were label-based).
train_data_array = list(zip(train_data_df['Item_Description'].values,
                            train_data_df['Product_Category'].values))

# `with` guarantees the file is closed even if pickling raises.
with open("pickled_data/train_data.pickle", "wb") as save_train_data:
    pickle.dump(train_data_array, save_train_data)

In [11]:
# Build a list of (cleaned description, invoice id) tuples, one per test row.
# The columns are paired by position via `.values`, so this does not rely on
# the DataFrame index being exactly 0..n-1 (the previous `series[i]` lookups
# were label-based).
test_data_array = list(zip(test_data_df['Item_Description'].values,
                           test_data_df['Inv_Id'].values))

# `with` guarantees the file is closed even if pickling raises.
with open("pickled_data/test_data.pickle", "wb") as save_test_data:
    pickle.dump(test_data_array, save_test_data)