In [46]:
import pandas as pd
import numpy as np
import re
import  nltk
print(nltk.__version__)
import spacy
spacy.__version__
import pickle
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import string
import demoji

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

3.8.1


[nltk_data] Downloading package stopwords to /home/gillus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gillus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gillus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gillus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
#spacy.cli.download("en_core_web_lg")

In [17]:
nlp = spacy.load("en_core_web_lg")

In [18]:
df = pd.read_csv("./train.tsv", sep="\t")
df

Unnamed: 0,id,text,label
0,12322,you need to stop the engine and wait until it ...,generated
1,1682,The Commission shall publish the report; an in...,generated
2,22592,"I have not been tweeting a lot lately, but I d...",generated
3,17390,I pass my exam and really thankgod for that bu...,human
4,30453,The template will have 3 parts: a mustache sha...,human
...,...,...,...
33840,16850,"@PierreJoye i have a server already, thanks fo...",human
33841,6265,10. Article 13 of Council Directive 80/777/EEC...,human
33842,11284,"Crying because I have to cry for you?. No. No,...",generated
33843,860,"However, it will continue to offer customers a...",generated


In [47]:


stopword_list = stopwords.words('english')
punctuations = string.punctuation

emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth      
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

def remove_emojis(text):
    return demoji.replace(text, '')

def spacy_nlp_tokenizer(text):

    # substituting all space characters with a single space
    text = re.sub('\s+', ' ', text)
    # removing Url
    text = re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
    #removing mention
    text = re.sub(r'@[\w]+', "", text, flags=re.MULTILINE)
    # remove textual emoji
    text = re.sub(emoticon_string,'',text, flags=re.MULTILINE)
    # remove graphical emoji
    text = remove_emojis(text) 
    
    doc = nlp(text)

    # lowering case
    tokens = [token.text.lower() for token in doc]
    
    # removing punctuations
    tokens_nopunct = [token for token in tokens if token not in punctuations]
    
    # creating ngrams
    tokens_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(tokens_nopunct,2)]
    tokens_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(tokens_nopunct,3)]

    all_tokens = list()
    all_tokens.extend(tokens_nopunct)
    all_tokens.extend(tokens_bigrams)
    all_tokens.extend(tokens_trigrams)
    return all_tokens

In [21]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=1)

In [22]:
len(X_train), len(y_train), len(X_test), len(y_test)

(27076, 27076, 6769, 6769)

In [23]:
vect=CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df = 5)

In [24]:
print('fit')
# Just creating the features space. It defines the dimensions.
vect.fit(X_train) 
print('transform')
#Creating the vectors
X_train_tok = vect.transform(X_train) 
print('done')

X_test_tok =vect.transform(X_test)

fit
transform
done


In [25]:
vocabulary = vect

In [26]:
with open('./x_train_tok.pkl','wb') as outfile:
    pickle.dump(X_train_tok, outfile)
with open('./x_test_tok.pkl','wb') as outfile:
    pickle.dump(X_test_tok, outfile)

In [27]:
with open('./vocabulary.pkl','wb') as outfile:
    pickle.dump(vocabulary, outfile)

In [28]:
# with open('./x_train_tok.pkl',mode='br') as inputfile:
#     X_train_tok = pickle.load(inputfile)
# with open('./x_test_tok.pkl',mode='br') as inputfile:
#     X_test_tok = pickle.load(inputfile)
# with open('./vocabulary.pkl',mode='br') as inputfile:
#     vocabulary = pickle.load(inputfile)

## Logistic Regression

In [48]:
LR_pipeline = Pipeline([
    ('sel', SelectKBest(chi2, k=12000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LogisticRegression(max_iter=200))  # learning algorithm
])

LR_pipeline.fit(X_train_tok , y_train)
predictions = LR_pipeline.predict(X_test_tok)

print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

   generated       0.80      0.80      0.80      3428
       human       0.80      0.79      0.79      3341

    accuracy                           0.80      6769
   macro avg       0.80      0.80      0.80      6769
weighted avg       0.80      0.80      0.80      6769

Confusion matrix:
[[2757  671]
 [ 701 2640]]


In [30]:
tokenizer = vocabulary
selector = LR_pipeline.named_steps['sel']
classifier = LR_pipeline.named_steps['learner']

In [49]:
#questo mi restituisce le features prese in considerazione dal selectKBest

feature_names = tokenizer.get_feature_names_out() #Get output feature names for transformation.
feats_w_score = list()
#selector.get_support() mi dice quali features sono state prese in considerazione e quali no
#selector.score_: scores of features.
for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score)
len(feats_w_score)

68202

In [32]:
feats_w_score[:100],feats_w_score[-100:]

([(8.151873332063225e-09, False, 'BI_is_too'),
  (6.85936674506259e-06, False, 'BI_1_2'),
  (6.85936674506259e-06, False, 'directives'),
  (8.221823892670018e-06, False, 'shampoo'),
  (1.781712243222258e-05, False, 'BI_for_your'),
  (6.160062629522848e-05, False, 'BI_ _yeah'),
  (6.160062629522848e-05, False, 'BI_agreement_between'),
  (7.466459240969449e-05, False, 'BI_back_and'),
  (7.466459240969449e-05, False, 'BI_may_take'),
  (7.466459240969449e-05, False, 'TRI_should_be_able'),
  (0.00010764622918458291, False, 'TRI_good_idea_to'),
  (0.00012320125259045696, False, 'physical'),
  (0.00013563426011169307, False, 'TRI_the_committee_of'),
  (0.00014932918481938898, False, 'supply'),
  (0.0002171871615416713, False, 'BI_account_of'),
  (0.0002330712579415054, False, 'TRI_in_front_of'),
  (0.00027841003867685055, False, 'BI_until_the'),
  (0.0003983594152084282, False, 'BI_in_front'),
  (0.00043409536403958224, False, 'car'),
  (0.0004494157642345621, False, 'BI_agreement_is'),
  (0.

In [50]:
#queste sono le features usate nella classificazione

feats_w_classifier_weight = list()
for index,weight in enumerate(selector.inverse_transform(classifier.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

12000

In [51]:
feats_w_classifier_weight[:10]

[(-2.57151643594633, 'paragraph'),
 (-2.500703489305884, 'BI_the_following'),
 (-2.3753008207803012, 'additionally'),
 (-2.2411667081693643, 'BI_2_the'),
 (-2.1239990912445834, 'BI_european_union'),
 (-2.1180930850120423, 'BI_3_the'),
 (-2.1040733895318424, 'TRI_for_the_council'),
 (-2.078102594355812, 'TRI_for_the_european'),
 (-2.0527613413743433, 'BI_are_not'),
 (-2.026392585855323, 'pi')]

In [52]:
feats_w_classifier_weight[-10:]

[(2.770623107649165, 'in'),
 (2.839285729875487, 'sad'),
 (2.9050440227926018, 'BI_this_decision'),
 (2.9402607404150194, 'though'),
 (2.9982468580188377, 'your'),
 (3.007231692553673, 'brussels'),
 (3.1132239081676083, 'BI_at_brussels'),
 (3.1311884150904725, 'TRI_done_at_brussels'),
 (3.269741707247101, 'BI_done_at'),
 (3.387542405630033, 'amp')]