In [213]:
import pandas as pd
import re

In [214]:
df=pd.read_csv("Dataset/train_2kmZucJ.csv")
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [215]:
df.label.value_counts()

label
0    5894
1    2026
Name: count, dtype: int64

In [216]:
print(df[df.id==5]['tweet'].str.lower())

4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object


### Data Preprocessing

### 1)LowerCasing

In [217]:
df['tweet']=df['tweet'].str.lower()
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test https://goo.gl/h1...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


### 2) Removing Links

In [218]:
def remove_links(s):
    pattern=re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r"",s)

In [219]:
df['tweet']=df['tweet'].apply(remove_links)
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


In [220]:
df.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [221]:
df.drop("id",axis=1,inplace=True)
df.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #pregnancy test #android #apps #...
1,0,finally a transparant silicon case ^^ thanks t...
2,0,we love this! would you go? #talk #makememorie...
3,0,i'm wired i know i'm george i was made that wa...
4,1,what amazing service! apple won't even talk to...


### Handling Spellings

In [222]:
from symspellpy import SymSpell, Verbosity

In [223]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

In [224]:
def correct_spelling(text):
    corrected_words = []
    for word in text.split():
        # Get the best match for each word
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        # If a correction is found, take the first suggestion; otherwise, keep the word as is
        if suggestions:
            corrected_words.append(suggestions[0].term)
        else:
            corrected_words.append(word)
    return " ".join(corrected_words)

In [225]:
df['corrected_tweet'] = df['tweet'].apply(correct_spelling)

In [227]:
df.head()

Unnamed: 0,label,tweet,corrected_tweet
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...
1,0,finally a transparant silicon case ^^ thanks t...,finally a transparent silicon case of thanks t...
2,0,we love this! would you go? #talk #makememorie...,we love this would you go talk #makememories u...
3,0,i'm wired i know i'm george i was made that wa...,i'm wired i know i'm george i was made that wa...
4,1,what amazing service! apple won't even talk to...,what amazing services apple won't even talk to...


### 3) Handling HashTags

In [228]:
def hashTagHandler(s):
    return re.findall(r'#(\w+)',s)

In [229]:
temp=df.iloc[0]['tweet']
results=hashTagHandler(temp)
results

['fingerprint',
 'pregnancy',
 'android',
 'apps',
 'beautiful',
 'cute',
 'health',
 'igers',
 'iphoneonly',
 'iphonesia',
 'iphone']

In [230]:
df["tweet_hashTags"]=df['corrected_tweet'].apply(hashTagHandler)
df.head()

Unnamed: 0,label,tweet,corrected_tweet,tweet_hashTags
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...,"[iphoneonly, iphonesia]"
1,0,finally a transparant silicon case ^^ thanks t...,finally a transparent silicon case of thanks t...,"[xperia, sonyexperias]"
2,0,we love this! would you go? #talk #makememorie...,we love this would you go talk #makememories u...,"[makememories, connect]"
3,0,i'm wired i know i'm george i was made that wa...,i'm wired i know i'm george i was made that wa...,[daventry]
4,1,what amazing service! apple won't even talk to...,what amazing services apple won't even talk to...,[]


### 4) Removing Hashtags from original tweets

In [231]:
def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

In [232]:
df['corrected_tweet']=df['corrected_tweet'].apply(remove_hashtags)
df.head()

Unnamed: 0,label,tweet,corrected_tweet,tweet_hashTags
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...,"[iphoneonly, iphonesia]"
1,0,finally a transparant silicon case ^^ thanks t...,finally a transparent silicon case of thanks t...,"[xperia, sonyexperias]"
2,0,we love this! would you go? #talk #makememorie...,we love this would you go talk unplug relax i...,"[makememories, connect]"
3,0,i'm wired i know i'm george i was made that wa...,i'm wired i know i'm george i was made that wa...,[daventry]
4,1,what amazing service! apple won't even talk to...,what amazing services apple won't even talk to...,[]


### 5) Removing puntuations

In [233]:
import string

In [234]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_puntuations(s):
    return re.sub(r'[^\w\s]', '', s)
    

In [235]:
df['corrected_tweet']=df['corrected_tweet'].apply(remove_puntuations)
df.head()

Unnamed: 0,label,tweet,corrected_tweet,tweet_hashTags
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...,"[iphoneonly, iphonesia]"
1,0,finally a transparant silicon case ^^ thanks t...,finally a transparent silicon case of thanks t...,"[xperia, sonyexperias]"
2,0,we love this! would you go? #talk #makememorie...,we love this would you go talk unplug relax i...,"[makememories, connect]"
3,0,i'm wired i know i'm george i was made that wa...,im wired i know im george i was made that way ...,[daventry]
4,1,what amazing service! apple won't even talk to...,what amazing services apple wont even talk to ...,[]


### Stopwords removal

In [None]:

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prathik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
def remove_stopwords(sent):
    new_sent=[]

    for word in sent.split():
        if word not in stopwords.words("english"):
            new_sent.append(word)
        else:
            new_sent.append('')
        
    return " ".join(new_sent)

In [236]:
df['corrected_tweet']=df['corrected_tweet'].apply(remove_stopwords)
df.head()

Unnamed: 0,label,tweet,corrected_tweet,tweet_hashTags
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...,"[iphoneonly, iphonesia]"
1,0,finally a transparant silicon case ^^ thanks t...,finally transparent silicon case thanks un...,"[xperia, sonyexperias]"
2,0,we love this! would you go? #talk #makememorie...,love would go talk unplug relax iphone smar...,"[makememories, connect]"
3,0,i'm wired i know i'm george i was made that wa...,im wired know im george made way iphone c...,[daventry]
4,1,what amazing service! apple won't even talk to...,amazing services apple wont even talk que...,[]


In [237]:
df['hashtags_text'] = df['tweet_hashTags'].apply(lambda x: ' '.join(x))
df['bert_input_text'] = df['corrected_tweet'] + ' ' + df['hashtags_text']

df.head()

Unnamed: 0,label,tweet,corrected_tweet,tweet_hashTags,hashtags_text,bert_input_text
0,0,#fingerprint #pregnancy test #android #apps #...,fingerprint pregnancy test android apps beauti...,"[iphoneonly, iphonesia]",iphoneonly iphonesia,fingerprint pregnancy test android apps beauti...
1,0,finally a transparant silicon case ^^ thanks t...,finally transparent silicon case thanks un...,"[xperia, sonyexperias]",xperia sonyexperias,finally transparent silicon case thanks un...
2,0,we love this! would you go? #talk #makememorie...,love would go talk unplug relax iphone smar...,"[makememories, connect]",makememories connect,love would go talk unplug relax iphone smar...
3,0,i'm wired i know i'm george i was made that wa...,im wired know im george made way iphone c...,[daventry],daventry,im wired know im george made way iphone c...
4,1,what amazing service! apple won't even talk to...,amazing services apple wont even talk que...,[],,amazing services apple wont even talk que...


In [240]:
from sklearn.utils.class_weight import compute_class_weight
import torch

In [241]:
# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=[0, 1], y=df['label'])
print(class_weights)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print(class_weights_tensor.shape)

[0.6718697  1.95459033]
torch.Size([2])


In [247]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

In [248]:

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.config.class_weights = class_weights_tensor 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [249]:
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

In [252]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`