In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pickle
import pandas as pd
import warnings
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('Technology Sub.csv')

In [3]:
data = data.drop(columns=data.columns[[0]])

In [4]:
data = data.fillna("")

In [5]:
data.columns

Index(['post_text', 'post_flair', 'post_additional_url', 'URL Text'], dtype='object')

In [6]:
data = data.sample(frac=1).reset_index(drop=True)

In [7]:
data['URL Text'][105]

'Copyright © 2021, Quiller Media, Inc. Contact Us | Privacy PolicySomewhere along the line, Facebook moved from being the preposterously insecure social media service, into a corporation whose people seem pretty insecure —  which has been further demonstrated by the company on Thursday trying its latest attempt to convince people how privacy-breaking targeted advertisements are actually good for consumers. \rOn Thursday, we are yet again seeing a Facebook blogging tirade where the company wants to be seen as our lone champion against —  well, anything it can grab on to. If you can\'t be bothered to read this latest diatribe, the short version is that the company appears to believe that we need our online habits tracked in order to provide us with more ads, and we should be thankful that Facebook gives us that.\rThe even shorter version is that there\'s no difference from the last tirade —  except that the new blog has an accompanying ad campaign. It doesn\'t matter - Facebook knows you

In [8]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

data['post_text'] = data['post_text'].apply(lambda x:clean_text(x))
data['URL Text'] = data['URL Text'].apply(lambda x:clean_text(x))

flairs = data['post_flair'].value_counts().index

In [9]:
flairs

Index(['Space', 'Nanotech/Materials', 'Society', 'Transportation', 'Business',
       'Robotics/Automation', 'Networking/Telecom', 'Energy', 'Biotechnology',
       'Social', 'Hardware', 'Software', 'Politics', 'Privacy', 'Crypto',
       'Security', 'SECURITY', 'Machine', 'CRYPTO', 'PRIVACY', 'crypto',
       'politics', 'software', 'artificial', 'POLITICS', 'hardware',
       'security', 'HARDWARE'],
      dtype='object')

In [10]:
def logisticreg(X_train,y_train,X_test,y_test):
    
  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=100, penalty='l2')),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print(len(y_pred))

  print('accuracy %s' % accuracy_score(y_pred, y_test))
    

  return logreg
#  print(classification_report(y_test, y_pred,target_names=flairs))

In [11]:
def nb_classifier(X_train,y_train,X_test,y_test):
  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
#  print(classification_report(y_test, y_pred,target_names=flairs))

In [12]:
## mlp
def mlpclassifier(X_train,y_train,X_test,y_test):  
  mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
  mlp.fit(X_train, y_train)

  y_pred = mlp.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
#  print(classification_report(y_test, y_pred,target_names=flairs))

In [13]:
def xgbclassifier(X_train,y_train,X_test,y_test):  
    xgb_clf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6,eval_metric='merror',subsample=0.7,objective='multi:softmax')),
                 ])
    xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
#    print(classification_report(y_test, y_pred,target_names=flairs))

In [14]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 42)
   
#    print("Results of Logistic Regression")
    return logisticreg(X_train,y_train,X_test,y_test)
#    print("Results of XGB")
#    xgbclassifier(X_train,y_train,X_test,y_test)
#    print("Results of Naive Bayes")
#    nb_classifier(X_train,y_train,X_test,y_test)
#    print("Results of MLP")
#    mlpclassifier(X_train,y_train,X_test,y_test)
    
cat = data['post_flair']
V = data['URL Text']
W = data['post_text']
U = data['post_additional_url']

#print("______________________________________________\n")
#print("Flair Detection using URL link text as feature")
#train_test(U,cat)
#print("Flair Detection using URL page text as feature")
#train_test(V,cat)
#print("______________________________________________\n")
#print("Flair Detection using title text as feature")
#train_test(W,cat)

print("______________________________________________\n")
print("Flair Detection using ALL THREE as feature")
model = train_test(U+W+V,cat)
   

______________________________________________

Flair Detection using ALL THREE as feature
164
accuracy 0.5792682926829268


In [15]:
import joblib

In [29]:
V+W

0       assist professor chemistri washington univers ...
1                            🌊🍭🌊🍭🌊🍭🌊 rosalía social media
2       year fight depress final got courag post socia...
3       controversi facial recognit startup clearview ...
4       binanc smart chain centralis decentralis let d...
                              ...                        
1632    ÿøÿà jfifdoesn’t wanna friend block social med...
1633    seen interest robot design year allow adventur...
1634                                                 hack
1635    walmart unveil new initi aim store fulfil bigg...
1636    bank digit servic protect wealth cybersecur pr...
Length: 1637, dtype: object

In [26]:
filename = 'finalised_model.bin'
joblib.dump(model,filename)

['finalised_model.bin']