In [165]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv('../Datasets/trainData/all-data.csv', header=None, names=['labels', 'Text'] )

In [13]:
df.head()

Unnamed: 0,labels,Text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [80]:
df.labels.value_counts()

neutral     2879
positive    1363
negative     604
Name: labels, dtype: int64

In [81]:
### Playing with the data

In [82]:
nlp = spacy.load('en_core_web_lg')

In [83]:
# playing with the first row

In [114]:
doc = nlp(df.iloc[0]['Text'])

In [147]:
if 'the' in stopwords:
    print(True)

True


In [150]:
stopwords = nlp.Defaults.stop_words
" ".join([token.lemma_ for token in doc if not token.is_stop])
    

'accord Gran , company plan production Russia , company grow .'

In [None]:
# Step 1 Lemmatization

In [106]:
def lemma(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [124]:
stopwords = nlp.Defaults.stop_words
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [154]:
def stopword(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

In [158]:
def remove_punctuation(text):
    doc=nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

In [132]:
df['lemmaText'] = df['Text'].apply(lemma)


In [155]:
df['lemmaText'] = df['Text'].apply(stopword)

In [159]:
df['lemmaText'] = df['lemmaText'].apply(remove_punctuation)

In [160]:
df.head()

Unnamed: 0,labels,Text,lemmaText
0,neutral,"According to Gran , the company has no plans t...",accord Gran company plan production Russia com...
1,neutral,Technopolis plans to develop in stages an area...,"Technopolis plan develop stage area 100,000 sq..."
2,negative,The international electronic industry company ...,international electronic industry company Elco...
3,positive,With the new production plant the company woul...,new production plant company increase capacity...
4,positive,According to the company 's updated strategy f...,accord company update strategy year 2009 2012 ...


In [163]:
features = df[['lemmaText']]
labels = df[['labels']]

In [190]:
X_train, X_test, y_train, y_test = train_test_split(features,labels,test_size = 0.15, random_state=42)

In [191]:
X_test

Unnamed: 0,lemmaText
3207,company suppose deliver machinery veneer mill ...
1684,UNC Charlotte deploy SSH Tectia Connector enab...
1044,2009 Lee Man combine annual production capacit...
4145,` ` high figure european scale Noop say recall...
1538,Finland correspond service Alma Media Etuovi.c...
...,...
2073,h+_kan dahlstr+ � m head mobility service Teli...
1426,shopping center prime location right Myllypuro...
1444,check bid Deka Showroom Fortitude Valley
1720,seminar introduce modern clean technology prod...


In [170]:
print("Data distribution:\n- Train: {} \n- Test: {}".format(len(y_train),len(y_test)))

Data distribution:
- Train: 4119 
- Test: 727


In [192]:
def vectorize(data,tfidf_vect_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [193]:
tfidf_vect = TfidfVectorizer()
tfidf_vect_fit=tfidf_vect.fit(X_train['lemmaText'])
X_train=vectorize(X_train['lemmaText'],tfidf_vect_fit)

In [194]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier()
scores = cross_val_score(rf,X_train,y_train.values.ravel(),cv=5)

In [195]:
print(scores)
scores.mean()

[0.72208738 0.72815534 0.72694175 0.75121359 0.7290401 ]


0.7314876310915547

In [196]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [197]:
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5,50,100],
    'max_depth': [2,10,20,None]
}

cv = GridSearchCV(rf,parameters)
cv.fit(X_train,y_train.values.ravel())
print_results(cv)

BEST PARAMS: {'max_depth': None, 'n_estimators': 100}

0.597 (+/-0.003) for {'max_depth': 2, 'n_estimators': 5}
0.595 (+/-0.001) for {'max_depth': 2, 'n_estimators': 50}
0.595 (+/-0.001) for {'max_depth': 2, 'n_estimators': 100}
0.629 (+/-0.017) for {'max_depth': 10, 'n_estimators': 5}
0.615 (+/-0.008) for {'max_depth': 10, 'n_estimators': 50}
0.614 (+/-0.004) for {'max_depth': 10, 'n_estimators': 100}
0.663 (+/-0.021) for {'max_depth': 20, 'n_estimators': 5}
0.657 (+/-0.023) for {'max_depth': 20, 'n_estimators': 50}
0.655 (+/-0.021) for {'max_depth': 20, 'n_estimators': 100}
0.693 (+/-0.03) for {'max_depth': None, 'n_estimators': 5}
0.731 (+/-0.02) for {'max_depth': None, 'n_estimators': 50}
0.733 (+/-0.024) for {'max_depth': None, 'n_estimators': 100}


In [198]:
cv.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [199]:
rf2 = RandomForestClassifier(n_estimators=50,max_depth=None)
rf2.fit(X_train, y_train.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [200]:
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [201]:
X_test

Unnamed: 0,lemmaText
3207,company suppose deliver machinery veneer mill ...
1684,UNC Charlotte deploy SSH Tectia Connector enab...
1044,2009 Lee Man combine annual production capacit...
4145,` ` high figure european scale Noop say recall...
1538,Finland correspond service Alma Media Etuovi.c...
...,...
2073,h+_kan dahlstr+ � m head mobility service Teli...
1426,shopping center prime location right Myllypuro...
1444,check bid Deka Showroom Fortitude Valley
1720,seminar introduce modern clean technology prod...


In [202]:
X_test=vectorize(X_test['lemmaText'],tfidf_vect_fit)

y_pred = rf2.predict(X_test)
accuracy = round(accuracy_score(y_test,y_pred), 3)

In [203]:
accuracy

0.766