In [22]:
import pandas as pd

df = pd.read_excel('../data/cleaned_v2.xlsx')
df = df.set_index('Department')

In [25]:
df.groupby(df.index).count()

Unnamed: 0_level_0,Description
Department,Unnamed: 1_level_1
Building Permission,2119
Drainage,2132
Electrical,4037
Encroachment,1470
Garbage,16932
Garden,1653
Health,2802
Property Tax,1491
Road,5510
Stray Dogs,1405


In [100]:
df_for_training = pd.DataFrame()
SAMPLES = 1400
STATE = 2

In [101]:
for i in list(set(df.index)):
    df_for_training = df_for_training.append(df.loc[i].sample(n=SAMPLES,random_state=STATE))

In [102]:
df_for_training['class_label'] = df_for_training.index.factorize()[0]

In [103]:
df_for_training

Unnamed: 0_level_0,Description,class_label
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Property Tax,There are total 4 preprimary schools / Day car...,0
Property Tax,As 15 working days complete to submitted docum...,0
Property Tax,Both the flats are on same floor. Then why the...,0
Property Tax,FLAT NO CORRECTION,0
Property Tax,SEVERAL EMAILS SEND / NOT ONE REPLY / V POOR...,0
...,...,...
Garbage,Pigs Lying opposite the society premises all d...,10
Garbage,Sweeping not done,10
Garbage,Garbage dump,10
Garbage,Public toilet(s) cleaning,10


In [104]:
mapper = {}
for i,j in enumerate(list(set(df_for_training.index))):
    print(i,j)
    mapper[i] = j

0 Property Tax
1 Drainage
2 Building Permission
3 Garden
4 Stray Dogs
5 Encroachment
6 Water Supply
7 Health
8 Electrical
9 Road
10 Garbage


In [105]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manpreet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
stemmer = PorterStemmer()
words = stopwords.words("english")
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True)

In [107]:
X = df_for_training.copy() # Deep copy
X.columns = ['text','label']
X.index.name = None
X = X.reset_index()
X = X[['text','label']]
X = X.sample(frac=1)

In [108]:
X

Unnamed: 0,text,label
5153,Request to send a notice to society to get it ...,3
2038,CHOKED DRAINAGE LINE,1
6381,Stray Dogs Nuisance,4
4749,Pls have this tree uprooted b4 it falls & kill...,3
11147,DENGUE CASE TO YEARS OLD BOY DUE TO 1. OPEN...,7
...,...,...
7353,After 7:3 pm footpath outside DMart Baner Road...,5
879,Hi,0
8878,last 3 days WATER from corporation has not rea...,6
9727,No water at all today 3. 1. 1,6


In [109]:
import re
X['cleaned'] = X['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [110]:
x = X['cleaned']
y = X['label']

In [111]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [112]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [122]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

import pickle

In [125]:
models = {
            'RandomForest' : RandomForestClassifier(n_estimators=400,n_jobs=4),
            'SVC' : LinearSVC(),
            'MultinomialNaiveBayes' : MultinomialNB(),
            'LogisticRegression' : LogisticRegression(max_iter=200)
         }

In [126]:
Y_test = np.array(y_test)

In [133]:
for model_name in list(models.keys()):
    pipeline = Pipeline([('vect', vectorizer),
                         ('chi',  SelectKBest(chi2, k=1000)),
                         ('clf', models[model_name])])

    model = pipeline.fit(X_train, y_train)
    with open(f'{model_name}.pickle', 'wb') as f:
        pickle.dump(model, f)


    print(model_name)
    print(confusion_matrix(Y_test, model.predict(X_test)))
    print(classification_report(Y_test, model.predict(X_test)))
    print("Accuracy score: ","{:.2f}".format(model.score(X_test, Y_test) * 100))

RandomForest
[[260   3   9   1   0   7   2   0   0   1   0]
 [  2 212   3   6   0   2  19  27   1  15   6]
 [  7   3 212   3   0  30   4   5   2   4   1]
 [  2   0   4 236   1   6   4   9   6   5   2]
 [  0   1   1   0 291   2   0   2   2   0   0]
 [  4   3  22   0   1 242   1   5   2  17   2]
 [  5   3   1   0   1   3 266   1   0  10   4]
 [  0  22  17   6  17   5   5 155   1   5  35]
 [  3   0   2   3   2   1   2   0 252   4   0]
 [  2  19   4   6   0  10   5   8   4 213   3]
 [  0   0   0   4   4   1   2   9   2   6 227]]
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       283
           1       0.80      0.72      0.76       293
           2       0.77      0.78      0.78       271
           3       0.89      0.86      0.87       275
           4       0.92      0.97      0.94       299
           5       0.78      0.81      0.80       299
           6       0.86      0.90      0.88       294
           7       0.70      0.58   

In [128]:
mapper

{0: 'Property Tax',
 1: 'Drainage',
 2: 'Building Permission',
 3: 'Garden',
 4: 'Stray Dogs',
 5: 'Encroachment',
 6: 'Water Supply',
 7: 'Health',
 8: 'Electrical',
 9: 'Road',
 10: 'Garbage'}