In [0]:
import pandas as pd
import re
import spacy   
import numpy as np
nlp = spacy.load('en_core_web_sm') 

In [0]:
data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/Dissertation/all_nouns.csv").fillna("undefined")

In [0]:
def blend(X_train):
    final = []
    (rows, cols) = X_train.shape
    
    for row in range(rows):
        feature = ""
        for col in range(cols):
            feature += X_train.iloc[row][col] + " "
        final.append(feature)
    return final    


In [187]:
x = pd.concat([data["Noun Phrases"],data["Noun"],data["Propernoun"]],axis=1)
data["new"] = blend(x)
x = data["new"]
y = data["Labels"]
x.head(10)

0    planet_venus_starts new_year blaine_p_friedlan...
1    iowa_caucuses presidential_hopefuls tuesday_vo...
2    john_wall michael_lee john_wall locker_room br...
3    maryland_housing_office prince_georges financi...
4    southern_yemen sudarsan_raghavan southern_yeme...
5    jason_horowitz mitt_romney pam_arnold_powers r...
6    homicides_fall prince_georges allison_klein ma...
7    mike_shanahan right_direction barry_svrluga mi...
8    blue_jackets alex_ovechkin_scores katie_carrer...
9    warm_december martin_weil warm_december months...
Name: new, dtype: object

In [0]:
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
labels = oe.fit_transform(pd.DataFrame(y))

In [192]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42, shuffle=True)
# y_train.toarray().reshape(-1,1)
# y_test.toarray().reshape(-1,1)
print(X_train.shape)
print(y_train.shape)


(8000,)
(8000,)


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics


In [0]:
target_names = y.unique()
feature_names = []

In [0]:
def vectorize(vect,X_train,X_test,chisquare=False,n_features=10000):
    if vect == "hashing":
        vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(X_train)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    if vect == "hashing" :
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()
    if chisquare:
        ch2 = SelectKBest(chi2, k=n_features)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("selected features using chi2")
    if feature_names:
        feature_names = np.asarray(feature_names)
        return X_train,X_test,feature_names
    return X_train,X_test

In [0]:
def benchmark(clf,print_top10 = True,print_report = False,num_keywords = -10):
    print("Training: ")
    print(clf)
    clf.fit(X_train, y_train)
    print("trained model")
    pred = clf.predict(X_test)
    print("predicted using model")

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                try:
                    top10 = np.argsort(clf.coef_[i])[num_keywords:]
                    print("%s: %s" % (label, " ".join(feature_names[top10])))
                except:
                    print("\n\n####\nSomething went wrong, '{}' label might not have 10 features in total".format(label))
                    # print(clf.coef_[i])

        print()

    if print_report:
        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=target_names))
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score

In [0]:
print(X_train.shape)
print(y_train.shape)

X_train,X_test,feature_names = vectorize("tfidf",X_train,X_test,chisquare=False)
print(X_train.shape)
print(y_train.shape)
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=100), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=100),
         "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (LogisticRegression(n_jobs=1,C=1e5,max_iter=100),"lr"),
        (RandomForestClassifier(), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

(8000,)
(8000,)
(8000, 338805)
(8000,)
Ridge Classifier
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None, solver='sag',
                tol=0.01)


  '"sag" solver requires many iterations to fit '


trained model
predicted using model
accuracy:   0.713
dimensionality: 338805
density: 0.999964
top 10 keywords per class:
Local
: laurel ct 18 grocery_store old_man lakes merchandise shoplifting feb_8_merchandise laurel_lakes_ct
Politics
: jan feb thefts break president_obama police romantic_recommendations states white_women undefined
Wizards/NBA
: jan property west east thefts motor hwy vehicle motor_vehicle_thefts jan_17_property
Middle East
: death i_dont patients comfort nursing belief wishes own_death ventilator worthwhile
Redskins/NFL
: mail loudoun 202 adoption_hours pet st_nw motor_vehicle_thefts jan_17_property undefined block
Capitals/NHL
: speech policies conservatives act discussion panel constitution connection panel_discussion us_constitution
Sports
: area border latino massachusetts attacks critics immigrants undefined flags american_flags
Colleges
: ohio measure health_care sen disclosure undefined point offices folks federal_offices
Magazine
: region ups accomplice ja