In [None]:
import pandas as pd
import numpy as np

In [None]:
import spacy

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
# Load the 'en_core_web_sm' spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this chunk — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

In [None]:
import string
# ASCII punctuation characters as a single string; spacy_tokenizer tests tokens against it.
punctuations = string.punctuation

In [None]:
from spacy.lang.en import English
# Instantiate spaCy's English language class; spacy_tokenizer calls this object on each sentence.
parser = English()

In [None]:
# Copy spaCy's English stop words into a list; spacy_tokenizer filters tokens against it.
stopwords = list(STOP_WORDS)

In [None]:
def spacy_tokenizer(sentence):
    """Split ``sentence`` into normalized tokens for the vectorizers.

    Each token produced by the module-level ``parser`` is reduced to its
    lowercased, stripped lemma. Tokens are then discarded when they are
    empty, appear in ``stopwords``, or consist only of characters from
    ``punctuations``.

    Args:
        sentence: The text to tokenize.

    Returns:
        A list of token strings, in their original order.
    """
    normalized = []
    for token in parser(sentence):
        lemma = token.lemma_
        if lemma == "-PRON-":
            # "-PRON-" is a placeholder lemma rather than a real word
            # (presumably spaCy v2's pronoun lemma — confirm for the installed
            # version); use the lowercased surface form instead.
            normalized.append(token.lower_)
        elif lemma:
            normalized.append(lemma.lower().strip())
        else:
            # No lemma available (expected from a pipeline without a
            # lemmatizer on spaCy v3 — TODO confirm). Fall back to the
            # surface form so the token is not silently lost.
            normalized.append(token.lower_.strip())

    return [
        text
        for text in normalized
        if text
        and text not in stopwords
        # Character-wise check: the previous `text not in punctuations` was a
        # substring test on a string, so runs such as "..." slipped through.
        and not all(char in punctuations for char in text)
    ]

In [None]:
# Sample sentence used to try out spacy_tokenizer.
ex1 = "He was walking with the walker in the Wall he may had sat and run with the runner"

In [None]:
# Tokenize the sample sentence and display the resulting token list.
spacy_tokenizer(ex1)

['walking', 'walker', 'wall', 'sat', 'run', 'runner']

In [None]:
# Machine-learning packages (scikit-learn)
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [44]:
# Read the release-notes spreadsheet from the mounted Google Drive into a DataFrame.
DATASET_PATH = "/content/drive/MyDrive/Bert/hello.xlsx"
df = pd.read_excel(DATASET_PATH)

In [45]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Release_Number,Label,Date,Release_Note
0,97.0,New,"February 8, 2022",Firefox now supports and displays the new styl...
1,97.0,Fixed,"February 8, 2022","On macOS, we’ve made improvements to system fo..."
2,97.0,Fixed,"February 8, 2022",Various security fixes
3,97.0,Changed,"February 8, 2022","On February 8, we will be expiring the 18 colo..."
4,97.0,Changed,"February 8, 2022",Support for directly generating PostScript for...


In [46]:
# (rows, columns) of the loaded frame.
df.shape

(1333, 4)

In [47]:
class predictors(TransformerMixin):
    """Stateless transformer that runs ``clean_text`` over every input text.

    Nothing is learned during ``fit`` and no parameters are exposed through
    ``get_params``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return this transformer."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(item)`` for each item of ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty parameter dict."""
        return {}

In [48]:
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed, in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

In [49]:
# Vectorization: token counts over n-grams of 1 to 5 tokens, tokenized by spacy_tokenizer.
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,5)) 
# Alternative classifier, currently disabled:
# classifier = LinearSVC()
# Support-vector classifier with hard-coded C and gamma.
# NOTE(review): probability=True is set, but no cell visible in this chunk requests probabilities — confirm it is needed.
classifier = SVC(C=150, gamma=2e-2, probability=True)

In [50]:
# TF-IDF vectorizer using the same custom tokenizer.
# NOTE(review): not referenced by any other cell visible in this chunk — confirm it is used.
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
# Encode the string labels as integer codes, numbered in order of first appearance.
# The unique label names are kept too, so a predicted code can be mapped back
# with label_names[code] instead of a hand-maintained table.
# Note: pd.factorize assigns the code -1 to missing labels.
label_codes, label_names = pd.factorize(df['Label'])
df['labels'] = label_codes

In [53]:
# Features: the raw release-note text. Targets: the integer label codes.
X = df['Release_Note']
ylabels = df['labels']

In [54]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified by label — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [55]:
# Inspect the training texts.
X_train

1280    \n    Keep track of articles and videos with P...
1110    \n    Add-on signing certificate expiration (B...
966     \n    Use Nirmala UI as fallback font for addi...
598     \n    Enabled AV1 support on 32-bit Windows an...
170     Video controls now have visible focus styling ...
                              ...                        
1095    \n    The Firefox click-to-activate plugin whi...
1130    \n    Fix a potential performance regression (...
1294    \n    Firefox may become unresponsive after ri...
860     \n    Added support for address form autofill ...
1126    \n    Fix an issue which could cause the list ...
Name: Release_Note, Length: 1066, dtype: object

In [56]:
# Number of training samples.
X_train.shape

(1066,)

In [57]:
# Chain the three stages: text cleaning -> n-gram counting -> SVM classification.
pipeline_steps = [
    ("cleaner", predictors()),
    ("vectorizer", vectorizer),
    ("classifier", classifier),
]
pipe = Pipeline(steps=pipeline_steps)

In [58]:
# Fit every pipeline step on the training split.
pipe.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7faf7e116790>),
                ('vectorizer',
                 CountVectorizer(ngram_range=(1, 5),
                                 tokenizer=<function spacy_tokenizer at 0x7fb002aac9e0>)),
                ('classifier', SVC(C=150, gamma=0.02, probability=True))])

In [59]:
# Number of held-out test samples.
X_test.shape

(267,)

In [62]:
# Show the test sample whose index label is 44 (label-based lookup, not positional).
X_test.loc[44]

'\n    We’re rolling out the Firefox Multi-Account Containers extension with Mozilla VPN integration. This lets you use a different server location for each container.\n    \n  '

In [63]:
# Predict a label code for every test sample.
sample_prediction = pipe.predict(X_test)

In [64]:
# Display the predicted label codes.
sample_prediction

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 2, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 2, 0, 2, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 2, 2, 0, 1, 0,
       1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 2, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0])

In [65]:
# Score the predictions already stored in `sample_prediction`; pipe.score would
# push X_test through the whole pipeline a second time to get the same number.
print("Accuracy Score:", accuracy_score(y_test, sample_prediction))

Accuracy Score: 0.8277153558052435


In [66]:
# True label codes of the test split as an array, for comparison with the predictions.
y_test.values

array([1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 2,
       1, 1, 0, 1, 2, 1, 1, 0, 1, 2, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 1,
       2, 2, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 2, 2, 0, 0, 1, 1, 0, 1, 2, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 2, 0, 1, 1, 2,
       0, 2, 0, 2, 1, 1, 0, 2, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 1, 1, 1, 0, 2, 2, 2, 1, 1,
       1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1,
       1, 0, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 2, 0, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 2, 1, 1, 2, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 2, 2, 1, 1, 0, 2, 2, 0, 2, 0, 1, 1, 1, 1, 0, 2, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [67]:
# Take the test sample at position 7 (positional, not by index label) and print its text.
# ("örnek" is Turkish for "sample".)
örnek = X_test.iloc[7]
print(örnek)


    Fixed Picture-in-Picture controls being visible on audio-only page elements (bug 1666775)
    
  


In [68]:
# Predict the label code for the single sample ("tahmin" is Turkish for "prediction").
tahmin = pipe.predict([örnek])

# Map the predicted code to its printed name; codes outside 0-3 print "unresolved".
label_by_code = {0: "new", 1: "fixed", 2: "changed", 3: "other"}
print(label_by_code.get(tahmin[0], "unresolved"))

fixed


In [69]:
# Print each true label code from y_test next to the code predicted for the same sample.
for (sample,pred) in zip(y_test,sample_prediction):
    print(sample,"Prediction=>",pred)

1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 1
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 0
0 Prediction=> 1
0 Prediction=> 0
2 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 0
2 Prediction=> 2
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
2 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
0 Prediction=> 0
2 Prediction=> 1
2 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 1
0 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
1 Prediction=> 0
2 Prediction=> 0
2 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
0 Prediction=> 0
1 Prediction=> 1
0 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
1 Prediction=> 1
0 Prediction=> 0
1 Prediction=> 1
2 Prediction=> 2
2 Prediction=> 0
0 Prediction=>