### Text Classification with scikit-learn and spaCy, and Model Interpretation with ELI5
   + Text preprocessing with spaCy
   + Classifying text with scikit-learn
   + Interpreting the model with ELI5
    

In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS
# Load spaCy's small English pipeline.
# NOTE(review): the tokenizer defined below runs text through `parser`, not `nlp` —
# confirm `nlp` is actually needed elsewhere in the notebook.
nlp = spacy.load('en_core_web_sm')

In [4]:
import string
# ASCII punctuation characters as a single string; the tokenizer uses it to
# filter out punctuation tokens.
punctuations = string.punctuation

In [5]:
from spacy.lang.en import English
# Instantiate spaCy's English language class; the tokenizer below calls it on raw text.
parser = English()

In [6]:
# spaCy's English stop words, copied into a list for the tokenizer's stop-word filter.
stopwords = list(STOP_WORDS)

In [7]:
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` into lower-cased tokens without stop words or punctuation.

    Each token is represented by its lemma when the pipeline provides one;
    pronoun placeholders (``-PRON-``) and empty lemmas fall back to the
    lower-cased surface form.

    Args:
        sentence: Raw text; it is passed to the module-level ``parser``.

    Returns:
        list[str]: The surviving tokens, in document order.
    """
    # Module-level `stopwords` is a list and `punctuations` a string; build sets
    # once per call so every membership test is O(1) and, for punctuation,
    # a real per-character test rather than a substring search.
    stop_set = set(stopwords)
    punct_set = set(punctuations)

    tokens = []
    for word in parser(sentence):
        lemma = word.lemma_
        if lemma == "-PRON-":
            # Pronoun placeholder lemma: keep the lower-cased surface form instead.
            token = word.lower_
        else:
            # A pipeline without a lemmatizer may return an empty lemma; fall back
            # to the surface form so the token is not silently discarded.
            token = lemma.lower().strip() or word.lower_.strip()

        # Skip whitespace-only tokens and stop words.
        if not token or token in stop_set:
            continue
        # Skip tokens made up solely of punctuation characters ("," as well as "--" or "...").
        if all(ch in punct_set for ch in token):
            continue
        tokens.append(token)
    return tokens

In [8]:
# Example sentence for trying out the tokenizer.
ex1 = "He was walking with the walker in the Wall he may had sat and run with the runner"

In [9]:
# Show which tokens survive stop-word and punctuation filtering for the example.
spacy_tokenizer(ex1)

['walking', 'walker', 'wall', 'sat', 'run', 'runner']

In [10]:
# Machine-learning packages (scikit-learn)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [11]:
!pip install eli5
import eli5

Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 5.2 MB/s 
Collecting jinja2>=3.0.0
  Downloading Jinja2-3.1.2-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 54.3 MB/s 
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=da4f7b8c5c9dec56541daa403920d5aef5f4d177e839923545ae14635c51b0b6
  Stored in directory: /root/.cache/pip/wheels/cc/3c/96/3ead31a8e6c20fc0f1a707fde2e05d49a80b1b4b30096573be
Successfully built eli5
Installing collected packages: jinja2, eli5
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following 

In [12]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only) so the dataset path below resolves.
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Load the release-notes dataset from the mounted Google Drive.
# NOTE(review): hard-coded Colab path — Drive must be mounted before this cell runs.
df=pd.read_excel("/content/drive/MyDrive/Bert/Release_Notes_v0_RawData.xlsx")

In [14]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,ID,Project_Name,Release_Number,Date,Label,Type,Release_Note
0,1.0,Mozilla Firefox,86.0,2021-02-23,new,new,Firefox now supports simultaneously watching m...
1,2.0,Mozilla Firefox,86.0,2021-02-23,fixed,fixed,Reader mode now works with local HTML pages. U...
2,3.0,Mozilla Firefox,86.0,2021-02-23,changed,changed,"On Linux and Android, the protection to mitiga..."
3,4.0,Mozilla Firefox,86.0,2021-02-23,other,developer,Inactive CSS tool is now showing a warning whe...
4,5.0,Mozilla Firefox,85.0.2,2021-02-09,fixed,fixed,Fixed a deadlock during startup (bug 1679933)


In [15]:
# (rows, columns) of the loaded frame.
df.shape

(800, 7)

In [16]:
class predictors(TransformerMixin):
    """Stateless pipeline step that runs ``clean_text`` over every document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text`` applied to each element of ``X``."""
        cleaned = []
        for document in X:
            cleaned.append(clean_text(document))
        return cleaned

    def get_params(self, deep=True):
        """Report an empty parameter mapping."""
        return dict()

In [17]:
def clean_text(text):
    """Normalize one document: trim surrounding whitespace and lower-case it.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents an
            empty cell) are treated as empty text; any other non-string value
            is converted with ``str`` first.

    Returns:
        str: The stripped, lower-cased text ("" for missing values).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).strip().lower()

In [18]:
# Vectorization: bag-of-words counts over 1- to 5-grams of the spaCy tokens.
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 5))
# Support-vector classifier (LinearSVC is imported above as a linear alternative).
# probability=True fits an internal cross-validated calibration that uses random
# numbers, so the seed is fixed to keep probability estimates reproducible.
classifier = SVC(C=150, gamma=2e-2, probability=True, random_state=42)

In [19]:
# TF-IDF alternative to the count vectorizer, using the same spaCy tokenizer.
# NOTE(review): not referenced by the pipeline built below — confirm it is used later.
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Encode each distinct Label value as an integer code, numbered in order of first
# appearance. The unique names are kept too, so a code can be turned back into
# its label with `label_names[code]`; pd.factorize codes missing labels as -1.
label_codes, label_names = pd.factorize(df.Label)
df['labels'] = label_codes

In [22]:
# Features are the raw release-note texts; targets are the integer label codes.
X = df['Release_Note']
ylabels = df['labels']

In [23]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [24]:
# Inspect the training texts (the index keeps the original DataFrame row labels).
X_train

264    Column choice and sort order in "Search Messag...
615    Fixed a bug where log files wouldn't be create...
329    Due to changes in the Mozilla platform profile...
342                               Various security fixes
394    Default to using SSL for XMPP and IRC. This mi...
                             ...                        
71                                          Security fix
106    Developer Information\nWAMP-formatted WebSocke...
270                Calendar: Problems with WCAP provider
435    Fixed a bug where the "Save Replay" button wou...
102                                Developer Information
Name: Release_Note, Length: 640, dtype: object

In [25]:
# Number of training documents.
X_train.shape

(640,)

In [26]:
# Chain the three stages: text cleaning -> n-gram counting -> classification.
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
)

In [27]:
# Fit every pipeline stage on the training texts and their label codes.
pipe.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7faf7ed43310>),
                ('vectorizer',
                 CountVectorizer(ngram_range=(1, 5),
                                 tokenizer=<function spacy_tokenizer at 0x7fb002aac9e0>)),
                ('classifier', SVC(C=150, gamma=0.02, probability=True))])

In [28]:
# Number of held-out test documents.
X_test.shape

(160,)

In [29]:
# Label-based lookup: the test document whose original DataFrame index is 7.
X_test.loc[7]

'Various bug fixes and new policies have been implemented in the latest version of Firefox. You can see more details in the\xa0Firefox for Enterprise 85 Release Notes.'

In [30]:
# Predict a label code for every test document.
sample_prediction = pipe.predict(X_test)

In [31]:
# Display the predicted label codes.
sample_prediction 

array([2, 1, 1, 2, 3, 2, 4, 2, 1, 0, 1, 4, 1, 1, 4, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 1, 3, 0, 1, 1, 1, 1, 3, 1, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 1,
       2, 2, 0, 1, 2, 1, 1, 1, 2, 4, 1, 1, 3, 4, 1, 1, 2, 2, 1, 1, 1, 2,
       1, 1, 4, 1, 1, 4, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1, 1, 1,
       1, 2, 3, 1, 2, 3, 0, 1, 3, 4, 1, 1, 1, 2, 1, 0, 1, 1, 0, 2, 2, 1,
       1, 1, 1, 0, 1, 1, 2, 3, 3, 0, 2, 0, 3, 2, 1, 1, 2, 1, 1, 2, 1, 0,
       2, 1, 1, 1, 2, 2])

In [32]:
# Score the fitted pipeline on the held-out test set (the pipeline re-predicts X_test here).
print("Accuracy Score:",pipe.score(X_test, y_test))

Accuracy Score: 0.725


In [33]:
# True label codes of the test set, for comparison with the predictions above.
y_test.values

array([2, 1, 1, 0, 3, 2, 4, 2, 1, 0, 1, 4, 2, 1, 4, 1, 1, 0, 2, 0, 2, 1,
       2, 2, 1, 3, 2, 1, 0, 1, 4, 3, 0, 1, 1, 0, 2, 1, 1, 1, 1, 0, 1, 1,
       2, 1, 0, 2, 2, 1, 4, 1, 2, 4, 1, 0, 3, 4, 0, 1, 2, 2, 1, 1, 1, 2,
       1, 1, 4, 1, 1, 4, 1, 0, 1, 3, 4, 1, 2, 1, 1, 3, 3, 1, 1, 1, 1, 0,
       4, 0, 1, 2, 2, 0, 2, 2, 2, 1, 2, 1, 1, 1, 1, 0, 3, 1, 0, 1, 2, 4,
       1, 0, 3, 0, 2, 3, 0, 1, 3, 4, 1, 1, 1, 3, 1, 2, 2, 1, 0, 2, 2, 2,
       2, 1, 1, 0, 0, 1, 3, 0, 3, 0, 2, 0, 3, 2, 1, 1, 2, 1, 1, 2, 1, 0,
       2, 1, 1, 1, 0, 2])

In [34]:
# Pick one example document ("örnek" is Turkish for "sample") at *position* 7.
# `.iloc` is positional, unlike the label-based `X_test[7]` lookup used earlier,
# so this selects a different row than that cell did.
örnek = X_test.iloc[7]
print(örnek)

Fixed some FLV files not playing back properly with the media source,Fixed two crashes with the browser source,Fixed the names of monitors not showing up when using projectors,Fixed a crash with window capture


In [35]:
# Predict the label code of the single example ("tahmin" is Turkish for "prediction").
tahmin = pipe.predict([örnek])

# Translate the predicted code into its label name; any code other than 0-3
# is reported as "unresolved".
print({0: "new", 1: "fixed", 2: "changed", 3: "other"}.get(tahmin[0], "unresolved"))

changed


In [36]:
# Print each true label code next to the model's prediction for the same test row.
for sample, pred in zip(y_test, sample_prediction):
    print(f"{sample} Prediction=> {pred}")

2 Prediction=> 2
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 2
3 Prediction=> 3
2 Prediction=> 2
4 Prediction=> 4
2 Prediction=> 2
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
4 Prediction=> 4
2 Prediction=> 1
1 Prediction=> 1
4 Prediction=> 4
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 2
2 Prediction=> 2
0 Prediction=> 2
2 Prediction=> 2
1 Prediction=> 2
2 Prediction=> 2
2 Prediction=> 2
1 Prediction=> 1
3 Prediction=> 3
2 Prediction=> 0
1 Prediction=> 1
0 Prediction=> 1
1 Prediction=> 1
4 Prediction=> 1
3 Prediction=> 3
0 Prediction=> 1
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 2
2 Prediction=> 2
1 Prediction=> 1
1 Prediction=> 1
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 1
2 Prediction=> 2
1 Prediction=> 2
0 Prediction=> 0
2 Prediction=> 1
2 Prediction=> 2
1 Prediction=> 1
4 Prediction=> 1
1 Prediction=> 1
2 Prediction=> 2
4 Prediction=> 4
1 Prediction=> 1
0 Prediction=> 1
3 Prediction=> 3
4 Prediction=> 4
0 Prediction=>