In [1]:
import pandas as pd
import re 

In [2]:
## toy data for checking the labeling mechanism:
## chunks with a complete <s>...</s> segment, with no segment at all,
## and one segment that is opened in one chunk and closed two chunks later
chunks = [
    "This is a <s>sample adv segment</s> of text.",
    "Another chunk <s>with an adv segment</s>.",
    "this is a chunk without adv segment",
    "This is the chunk with <s> atart here",
    "This is another chunk without the end.",
    "This is chunk with the end</s>.",
    "this is a chunk without adv segment"
]

# every chunk starts out unlabeled (adv = 0)
example_chunk_df = pd.DataFrame({'chunks': chunks}).assign(adv=0)
example_chunk_df

Unnamed: 0,chunks,adv
0,This is a <s>sample adv segment</s> of text.,0
1,Another chunk <s>with an adv segment</s>.,0
2,this is a chunk without adv segment,0
3,This is the chunk with <s> atart here,0
4,This is another chunk without the end.,0
5,This is chunk with the end</s>.,0
6,this is a chunk without adv segment,0


In [3]:
## easiest labeling: if a chunk contains at least one word from an adv segment, the whole chunk counts as adv

# Walk the chunks once and remember whether an <s> segment is still open.
# A chunk is flagged when it contains an opening tag or lies inside a segment
# opened by an earlier chunk (the chunk holding the closing </s> is included).
inside_segment = False
adv_flags = []
for chunk in example_chunk_df['chunks']:
    adv_flags.append(inside_segment or '<s>' in chunk)
    # The last tag in the chunk decides the state handed to the next chunk, so a
    # chunk such as "...</s> ... <s>..." correctly leaves a new segment open.
    last_open, last_close = chunk.rfind('<s>'), chunk.rfind('</s>')
    if last_open > last_close:
        inside_segment = True
    elif last_close != -1:
        inside_segment = False

# A boolean mask is applied by position, so this does not depend on the index labels.
example_chunk_df.loc[adv_flags, 'adv'] = 1

example_chunk_df

Unnamed: 0,chunks,adv
0,This is a <s>sample adv segment</s> of text.,1
1,Another chunk <s>with an adv segment</s>.,1
2,this is a chunk without adv segment,0
3,This is the chunk with <s> atart here,1
4,This is another chunk without the end.,1
5,This is chunk with the end</s>.,1
6,this is a chunk without adv segment,0


In [2]:
## loading the real data from the two sentence CSV files
frames = [pd.read_csv(path) for path in ('sentences_5000.csv', 'sentences2_5000.csv')]
sent, sent2 = frames

### stack the two big dfs row-wise (index labels are kept as read, so they may repeat)
### and start every row as non-adv
data = pd.concat([sent, sent2], axis=0).assign(adv=0)

In [8]:
### labelling function 

def stupid_labeling(data):
    """Flag every sentence that overlaps an ``<s> ... </s>`` adv segment.

    A row gets ``adv = 1`` when its sentence contains an opening ``<s>`` tag, or
    when it lies inside a segment that was opened in an earlier row and has not
    been closed yet (the row holding the closing ``</s>`` is included). Rows
    outside any segment keep whatever ``adv`` value they already had.

    Rows are processed in their stored order and labels are written by
    position, so the result does not depend on the frame's index (a frame with
    repeated index labels, e.g. straight out of ``pd.concat``, is fine).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'sentence'`` column. It is modified in place. If the
        ``'adv'`` column is missing it is created and filled with 0.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``'adv'`` column updated.
    """
    if 'adv' not in data.columns:
        data['adv'] = 0

    inside_segment = False
    flagged = []
    for chunk in data['sentence']:
        # Non-string cells (e.g. NaN) carry no tags; they only inherit the open state.
        text = chunk if isinstance(chunk, str) else ''
        flagged.append(inside_segment or '<s>' in text)
        # The last tag in the sentence decides the state handed to the next row,
        # so "...</s> ... <s>..." closes one segment and leaves a new one open.
        last_open, last_close = text.rfind('<s>'), text.rfind('</s>')
        if last_open > last_close:
            inside_segment = True
        elif last_close != -1:
            inside_segment = False

    if any(flagged):
        # A boolean mask is applied positionally, independent of index labels.
        data.loc[flagged, 'adv'] = 1
    return data

In [9]:
### drop the rows whose 'sentence' cell is empty, renumber the rows from 0, then label them
data = data.dropna(subset=['sentence']).reset_index(drop=True)
labeled_data = stupid_labeling(data)

In [11]:
# number of rows labeled as adv, followed by the total number of rows
print(int((labeled_data['adv'] == 1).sum()))
print(labeled_data.shape[0])

44543

In [40]:
### check: show full cell contents so whole sentences are readable when a frame is displayed
pd.options.display.max_colwidth = None
# to eyeball the first labeled rows: data[data['adv']==1][:30]

In [13]:
### now that the rows are labeled, the <s> and </s> markers are no longer needed, so strip them from the text
tag_pattern = re.compile(r'<\/?s>')
labeled_data['sentence'] = labeled_data['sentence'].map(lambda text: tag_pattern.sub('', text))

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# confusion_matrix is used below; without this import the cell fails with a
# NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# labeled_data is the DataFrame with the 'sentence' text and the binary 'adv' label
X = labeled_data['sentence']
y = labeled_data['adv']

# Split the data into training and test sets (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text into bag-of-words counts; the vocabulary is fitted on the
# training split only and then reused for the test split
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test_vec)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
## rows are the true labels, columns the predicted labels, both ordered 0, 1:
## TN FP
## FN TP
print(conf_matrix)

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96    100953
           1       0.59      0.45      0.51      8866

    accuracy                           0.93    109819
   macro avg       0.77      0.71      0.74    109819
weighted avg       0.92      0.93      0.93    109819

[[98232  2721]
 [ 4891  3975]]


In [72]:
# Rebalance the training split with random undersampling, then refit Naive Bayes
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_vec, y_train)

# size of the resampled training set, then of the original one
print(len(y_res), len(y_train), sep='\n')

# Fresh classifier fitted on the resampled data; the test split is left as it is
nb_classifier = MultinomialNB().fit(X_res, y_res)
y_pred = nb_classifier.predict(X_test_vec)

print("Classification Report:", classification_report(y_test, y_pred), sep='\n')

# Confusion matrix: rows are the true labels, columns the predicted labels (0, 1)
## TN FP
## FN TP
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

71354
439272
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.27      0.43    100953
           1       0.10      0.91      0.18      8866

    accuracy                           0.33    109819
   macro avg       0.54      0.59      0.30    109819
weighted avg       0.90      0.33      0.41    109819

[[27616 73337]
 [  767  8099]]


In [91]:
# Class probabilities for the test split; column 1 is the positive (adv) class
y_pred_prob = nb_classifier.predict_proba(X_test_vec)
positive_class_probs = y_pred_prob[:, 1]

# Only predict the positive class when its probability reaches this cut-off
# (adjust as needed)
custom_threshold = 0.8
y_pred_custom_threshold = (positive_class_probs >= custom_threshold).astype(int)

print(
    "Classification Report with custom threshold:",
    classification_report(y_test, y_pred_custom_threshold),
    sep='\n',
)

# Confusion matrix for the thresholded predictions
conf_matrix_custom_threshold = confusion_matrix(y_test, y_pred_custom_threshold)
print("Confusion Matrix with custom threshold:", conf_matrix_custom_threshold, sep='\n')

Classification Report with custom threshold:
              precision    recall  f1-score   support

           0       0.96      0.69      0.80    100953
           1       0.17      0.70      0.27      8866

    accuracy                           0.69    109819
   macro avg       0.57      0.70      0.54    109819
weighted avg       0.90      0.69      0.76    109819

Confusion Matrix with custom threshold:
[[69759 31194]
 [ 2617  6249]]


In [92]:
# Rebalance the training split with random oversampling, then refit Naive Bayes
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

# NOTE: the variable keeps the name `rus` although it now holds an over-sampler
rus = RandomOverSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_vec, y_train)

# size of the resampled training set, then of the original one
print(len(y_res), len(y_train), sep='\n')

# Fresh classifier fitted on the resampled data; the test split is left as it is
nb_classifier = MultinomialNB().fit(X_res, y_res)
y_pred = nb_classifier.predict(X_test_vec)

print("Classification Report:", classification_report(y_test, y_pred), sep='\n')

# Confusion matrix: rows are the true labels, columns the predicted labels (0, 1)
## TN FP
## FN TP
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


807190
439272
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.52      0.68    100953
           1       0.13      0.82      0.23      8866

    accuracy                           0.55    109819
   macro avg       0.55      0.67      0.45    109819
weighted avg       0.90      0.55      0.64    109819

[[52718 48235]
 [ 1622  7244]]


In [95]:
# Class probabilities for the test split; column 1 is the positive (adv) class
y_pred_prob = nb_classifier.predict_proba(X_test_vec)
positive_class_probs = y_pred_prob[:, 1]

# Only predict the positive class when its probability reaches this cut-off
# (adjust as needed)
custom_threshold = 0.8
y_pred_custom_threshold = (positive_class_probs >= custom_threshold).astype(int)

print(
    "Classification Report with custom threshold:",
    classification_report(y_test, y_pred_custom_threshold),
    sep='\n',
)

# Confusion matrix for the thresholded predictions
conf_matrix_custom_threshold = confusion_matrix(y_test, y_pred_custom_threshold)
print("Confusion Matrix with custom threshold:", conf_matrix_custom_threshold, sep='\n')

Classification Report with custom threshold:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94    100953
           1       0.38      0.56      0.45      8866

    accuracy                           0.89    109819
   macro avg       0.67      0.74      0.70    109819
weighted avg       0.91      0.89      0.90    109819

Confusion Matrix with custom threshold:
[[93053  7900]
 [ 3940  4926]]


In [28]:
### let's try TF-IDF features next; the text has to be cleaned (lemmatized) first
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as english_stop_words

# Only the parser and NER are switched off. The tagger and attribute_ruler must
# stay enabled: in spaCy v3 the English pipeline's rule-based lemmatizer needs
# the part-of-speech annotation they produce. Without it spaCy emits warning
# W108 and token.lemma_ is not a real lemma.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def clean_and_join(doc):
    """Return ``doc`` reduced to its space-separated content lemmas.

    The text is run through the module-level ``nlp`` pipeline. Punctuation
    tokens and tokens whose lower-cased text is in ``english_stop_words`` are
    dropped; the ``lemma_`` of every remaining token is kept, in order.
    """
    kept_lemmas = []
    for token in nlp(doc):
        if token.is_punct:
            continue
        if token.text.lower() in english_stop_words:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [29]:
#### this takes ages: clean_and_join runs the spaCy pipeline once per sentence
#### (open question: is this cleaning step needed at all?)
data_log = data.copy()
data_log['sentence'] = [clean_and_join(text) for text in data_log['sentence']]



KeyboardInterrupt: 