# Setup Environment

In [None]:
import tensorflow as tf

tf.config.list_physical_devices('GPU')

In [None]:
pip uninstall tensorflow

In [None]:
pip install tensorflow==2.5.0

# Data Loading

In [None]:
pip install pandas

In [None]:
pip install torch

In [None]:
##from google.colab import drive
##drive.mount('/content/drive')

In [None]:
import pandas as pd

In [None]:
file_path = 'train_sentiment.csv'

In [None]:
DATASET_COLUMNS=['labels','id','Date','Flag','User','Text']
DATASET_ENCODING = "utf-8"
df = pd.read_csv(file_path, encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

In [None]:
df.head()

Unnamed: 0,labels,id,Date,Flag,User,Text
0,4,2000548391,Mon Jun 01 22:22:01 PDT 2009,NO_QUERY,ticia42,"@Z12 She can't open the door by herself, so I ..."
1,0,2191932827,Tue Jun 16 06:13:30 PDT 2009,NO_QUERY,lhotfoot,@inournuclearage
2,4,1754199174,Sun May 10 05:23:02 PDT 2009,NO_QUERY,hockeycrew,@jesthebes At least your lawn hasn't been take...
3,0,1994056674,Mon Jun 01 11:20:41 PDT 2009,NO_QUERY,GeoBlack_Cat,"umm.. like, hello? where's the child support p..."
4,4,1980150068,Sun May 31 05:51:31 PDT 2009,NO_QUERY,rawrcelne,Joined twitter


In [None]:
df.shape

(1000000, 6)

In [None]:
df.isna().sum()

labels    0
id        0
Date      0
Flag      0
User      0
Text      0
dtype: int64

In [None]:
df.columns

Index(['labels', 'id', 'Date', 'Flag', 'User', 'Text'], dtype='object')

## Duplicate detection

In [None]:
duplicates = df[df.drop(columns=['labels']).duplicated()]

Select every row that repeats an earlier row in all columns except `labels`, i.e. repeated tweets where only the label can differ.

In [None]:
duplicates

Unnamed: 0,labels,id,Date,Flag,User,Text
17116,0,1989814053,Mon Jun 01 02:54:18 PDT 2009,NO_QUERY,AlleX91,Uugh ... school time ...again Raining outsi...
17244,4,1677642549,Sat May 02 03:11:33 PDT 2009,NO_QUERY,torilovesbradie,@charlii1 awwww they are lovely i wish i had ...
43683,0,1978117620,Sat May 30 22:29:18 PDT 2009,NO_QUERY,Kutski,Singapore was OFF THE HOOK last night forced ...
47638,0,1793140987,Thu May 14 01:31:50 PDT 2009,NO_QUERY,StampfliTurci,@billingtonart I felt the same way as the weat...
56575,4,2175919778,Mon Jun 15 02:22:59 PDT 2009,NO_QUERY,TheNewBradie,@NessaSlashRice hiii it was ummm lonely SO MA...
...,...,...,...,...,...,...
996001,4,1979562113,Sun May 31 03:34:54 PDT 2009,NO_QUERY,hayleytrotter,Ahhh i think CIbulkova is about to beat Szavay...
997561,4,1974742920,Sat May 30 13:45:20 PDT 2009,NO_QUERY,KristineDulay,@TWITTAH_G I know everytime I see commercials...
998144,0,1978027912,Sat May 30 22:15:44 PDT 2009,NO_QUERY,lesleeyvonne,Yeah sarah!! At my bouse! lol. I misss Fresno.
999333,4,1687630113,Sun May 03 09:11:42 PDT 2009,NO_QUERY,raeraeverret,"Shreveport this week for 311, NOLA next week f..."


In [None]:
duplicates.labels.value_counts()

labels
4    348
0    312
Name: count, dtype: int64

In [None]:
df = df.drop(duplicates.index)

In [None]:
df.shape

(999340, 6)

## Group by users

In [None]:
df_filtered = df.groupby('User').filter(lambda x: len(x) > 1)

In [None]:
usernames = df_filtered['User'].value_counts()
usernames

User
lost_dog         340
webwoke          210
tweetpet         183
VioletsCRUK      176
mcraddictal      166
                ... 
alannahapple       2
Chittaranjan       2
MaliStack          2
DStringzzZ         2
BFlautista624      2
Name: count, Length: 165371, dtype: int64

In [None]:
df_filtered = df.groupby('User').filter(lambda x: len(x) == 1)

In [None]:
df.dtypes

labels     int64
id         int64
Date      object
Flag      object
User      object
Text      object
dtype: object

## Drop redundant Features

In [None]:
df.drop(columns=['Flag'], inplace=True)

In [None]:
df.dtypes

labels     int64
id         int64
Date      object
User      object
Text      object
dtype: object

In [None]:
df.labels.unique()

array([4, 0], dtype=int64)

In [None]:
#df['labels'] = df['labels'].replace(4,1)

In [None]:
df

Unnamed: 0,labels,id,Date,User,Text
0,4,2000548391,Mon Jun 01 22:22:01 PDT 2009,ticia42,"@Z12 She can't open the door by herself, so I ..."
1,0,2191932827,Tue Jun 16 06:13:30 PDT 2009,lhotfoot,@inournuclearage
2,4,1754199174,Sun May 10 05:23:02 PDT 2009,hockeycrew,@jesthebes At least your lawn hasn't been take...
3,0,1994056674,Mon Jun 01 11:20:41 PDT 2009,GeoBlack_Cat,"umm.. like, hello? where's the child support p..."
4,4,1980150068,Sun May 31 05:51:31 PDT 2009,rawrcelne,Joined twitter
...,...,...,...,...,...
999995,0,1985361990,Sun May 31 16:57:39 PDT 2009,lutheasalom,this song's middle change just doesn't want to...
999996,4,2057029784,Sat Jun 06 12:14:24 PDT 2009,beeluz,@officialnjonas Good luck with that
999997,0,1835639354,Mon May 18 06:26:21 PDT 2009,lordmuttley,@ProudGamerTweet I rather average 32370
999998,0,2246780174,Fri Jun 19 18:06:46 PDT 2009,MizSadittyFancy,Pickin up @misstinayao waitin on @sadittysash ...


## Text preprocessing

In [None]:
standard_stopwords = [
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did',
    'do', 'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'has',
    'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if',
    'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'more', 'most', 'my',
    'myself', 'now', 'o', 'of', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
    'own', 're', 's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such', 't', 'than',
    'that', "thatll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they',
    'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'we', 'were',
    'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', 'y', 'you',
    "youd", "youll", "youre", "youve", 'your', 'yours', 'yourself', 'yourselves'
]

In [None]:
twitter_stopwords = [
    'rt', 'via', 'http', 'https', 'www', 'u', 'us', 'im', 'dont', 'ive', 'youre', 'amp',
    '@', '#', '&'
]

In [None]:
custom_stopwords = set(standard_stopwords + twitter_stopwords)

In [None]:
important_words = {'not', 'no', 'very', 'xoxo', 'lol', 'omg', 'thx', 'haha'}

In [None]:
custom_stopwords = custom_stopwords - important_words

In [None]:
import re

In [None]:
def preprocess_tweet(tweet, stopwords):
    """Strip URLs, @mentions and stopwords from a single tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    stopwords : collection of str
        Lower-cased words to drop; each whitespace-separated token is
        lower-cased and tested with ``in`` against this collection.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or the sentinel
        ``"EMPTY_TEXT"`` when no token survives.
    """
    # Remove URLs. The leading \b anchors the match to the start of a word, so
    # ordinary words that merely contain "www"/"http" (e.g. "awwww") are no
    # longer truncated. "http\S+" already covers "https...".
    tweet = re.sub(r'\b(?:http\S+|www\S+)', '', tweet)
    # Remove user @ references
    tweet = re.sub(r'\@\w+', '', tweet)
    # Tokenize on whitespace
    words = tweet.split()
    # Remove stopwords (case-insensitive; punctuation stays attached to tokens)
    words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(words) if words else "EMPTY_TEXT"

In [None]:
df['Cleaned_Text'] = df['Text'].apply(lambda x: preprocess_tweet(x, custom_stopwords))

## Fix double encoding

In [None]:
def fix_double_encoding(text):
    """Repair text whose UTF-8 bytes were mistakenly decoded as Latin-1.

    The string is re-encoded as Latin-1 to recover the original bytes and
    those bytes are decoded as UTF-8. The repair is applied only when the
    whole round trip succeeds; otherwise the input is returned unchanged, so
    text that is already correct (e.g. a genuine Latin-1 "é") is never turned
    into U+FFFD replacement characters.

    Parameters
    ----------
    text : str
        Possibly mis-decoded text.

    Returns
    -------
    str
        The repaired text, the original text when no repair applies, or the
        sentinel ``'EMPTY_TEXT'`` for empty input.
    """
    # Apply the empty-text sentinel on every path, not only in the error branch.
    if not text:
        return 'EMPTY_TEXT'
    try:
        # Encode the incorrectly decoded string back to bytes using 'latin1',
        # then decode strictly: invalid UTF-8 means the text was not
        # double-encoded, so it must be left alone.
        return text.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text

# Apply the function to fix encoding
df['Fixed_Text'] = df['Cleaned_Text'].apply(fix_double_encoding)

In [None]:
df_cleaned = df[['labels','id','Date','User', 'Fixed_Text']].copy()

In [None]:
df_cleaned

Unnamed: 0,labels,id,Date,User,Fixed_Text
0,4,2000548391,Mon Jun 01 22:22:01 PDT 2009,ticia42,"can't open door herself, think feels pain would."
1,0,2191932827,Tue Jun 16 06:13:30 PDT 2009,lhotfoot,EMPTY_TEXT
2,4,1754199174,Sun May 10 05:23:02 PDT 2009,hockeycrew,least lawn hasn't taken over field weeds!
3,0,1994056674,Mon Jun 01 11:20:41 PDT 2009,GeoBlack_Cat,"umm.. like, hello? where's child support payme..."
4,4,1980150068,Sun May 31 05:51:31 PDT 2009,rawrcelne,Joined twitter
...,...,...,...,...,...
999995,0,1985361990,Sun May 31 16:57:39 PDT 2009,lutheasalom,song's middle change doesn't want born..... ar...
999996,4,2057029784,Sat Jun 06 12:14:24 PDT 2009,beeluz,Good luck
999997,0,1835639354,Mon May 18 06:26:21 PDT 2009,lordmuttley,rather average 32370
999998,0,2246780174,Fri Jun 19 18:06:46 PDT 2009,MizSadittyFancy,Pickin waitin 2 hurry up...I odeeee missed dem...


In [None]:
empty_tweet = df_cleaned[df_cleaned["Fixed_Text"] == "EMPTY_TEXT"]
df_cleaned.drop(empty_tweet.index,inplace=True)

## Emoji extraction (note: this part was not used for modeling)

In [None]:
df_cleaned.shape

(996715, 5)

In [None]:
pip install emoji

Note: you may need to restart the kernel to use updated packages.


In [None]:
import emoji

def extract_emojis(text):
    """Collect the emoji characters of ``text`` into one space-separated string.

    Every character is checked on its own with ``emoji.is_emoji``; the result
    is an empty string when no character qualifies.
    """
    found = [char for char in text if emoji.is_emoji(char)]
    return ' '.join(found)

# Apply the function to extract emojis
df_cleaned['Emojis'] = df_cleaned['Fixed_Text'].apply(extract_emojis)

In [None]:
df_cleaned

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis
0,4,2000548391,Mon Jun 01 22:22:01 PDT 2009,ticia42,"can't open door herself, think feels pain would.",
2,4,1754199174,Sun May 10 05:23:02 PDT 2009,hockeycrew,least lawn hasn't taken over field weeds!,
3,0,1994056674,Mon Jun 01 11:20:41 PDT 2009,GeoBlack_Cat,"umm.. like, hello? where's child support payme...",
4,4,1980150068,Sun May 31 05:51:31 PDT 2009,rawrcelne,Joined twitter,
5,0,2242922534,Fri Jun 19 12:48:06 PDT 2009,Whacky,Gayle wrong guy wrong team much like Brian Lar...,
...,...,...,...,...,...,...
999995,0,1985361990,Sun May 31 16:57:39 PDT 2009,lutheasalom,song's middle change doesn't want born..... ar...,
999996,4,2057029784,Sat Jun 06 12:14:24 PDT 2009,beeluz,Good luck,
999997,0,1835639354,Mon May 18 06:26:21 PDT 2009,lordmuttley,rather average 32370,
999998,0,2246780174,Fri Jun 19 18:06:46 PDT 2009,MizSadittyFancy,Pickin waitin 2 hurry up...I odeeee missed dem...,


In [None]:
df_cleaned[(df_cleaned['Emojis'] != "") & (df_cleaned['labels']==0)]

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis
10813,0,2325953939,Thu Jun 25 06:19:48 PDT 2009,AlmaVienna,ps. wrote message myspace ;) But didnt wrote y...,♥
16879,0,2196976038,Tue Jun 16 13:10:38 PDT 2009,NileyLovers,hey back not back.. maybe internets off.. miss...,♥
30765,0,2256398984,Sat Jun 20 12:42:54 PDT 2009,LittleFloWer17,(: ♥ ♥ ...love not bad but it's little difficu...,♥ ♥ ♥ ♥
34239,0,2321444599,Wed Jun 24 21:03:55 PDT 2009,jesslee1331,good luck tonight =D♥ wish competitions coz wo...,♥
39762,0,2071449813,Sun Jun 07 19:24:59 PDT 2009,trashii,Wish weekend didn't end quickly lovelovelove♥ ...,♥
...,...,...,...,...,...,...
981578,0,2068113481,Sun Jun 07 13:31:17 PDT 2009,krissydietz,"go off, honey enough ! see tomorrow ! love ♥ t...",♥
982071,0,1997803602,Mon Jun 01 17:29:09 PDT 2009,rebeurka34,wish didn't say goodbye 2 months wcs ♥,♥
983550,0,2252191918,Sat Jun 20 05:41:26 PDT 2009,x_chiquita_x,going back vienna now... can't wait see roomma...,♥
994612,0,2322471006,Wed Jun 24 22:40:34 PDT 2009,sierrabardot,saved life. taught life lessons. yet i've stil...,☮ ♥


In [None]:
pd.set_option('display.max_colwidth', None)
df_cleaned['Fixed_Text'][ df_cleaned['id'] == 2325953939]

10813    ps. wrote message myspace ;) But didnt wrote yet Much Love,xoxo♥Alma
Name: Fixed_Text, dtype: object

## Remove tweets with garbled text

In [None]:
df_cleaned[df_cleaned['id'] == 2001157680]

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis
10157,4,2001157680,Tue Jun 02 00:00:45 PDT 2009,d1g_cartoon,"الضرورات تبيح ال� حظورات: الضرورات تبيح ال� حظورات ...Author: أ� ير الا� ة Added: الثلاثاء, 02 يونيو, 2009 08..",


In [None]:
## Check garbled text in DF

def contains_non_ascii(text):
    """Return True when ``text`` holds at least one non-ASCII character.

    An empty string counts as pure ASCII and yields False.
    """
    return not text.isascii()

# Apply the function to the DataFrame
df_cleaned['ContainsGarbledText'] = df_cleaned['Fixed_Text'].apply(contains_non_ascii)

In [None]:
df_cleaned[df_cleaned['ContainsGarbledText']== True]

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis,ContainsGarbledText
7,4,1990245213,Mon Jun 01 04:19:15 PDT 2009,jedbackhouse,today's agenda... - ♦ food town ♦ 1.15pm concert Uni ♦ Greenhead park w/mates &amp; football ♦ drinks in...,♦ ♦ ♦ ♦,True
64,4,2176385388,Mon Jun 15 03:43:49 PDT 2009,Omertoso,#musicmonday Time - John Cena ♫ ♪ ♫ Rafaga ♪ ♫ ♪,,True
136,4,1694108452,Mon May 04 01:07:19 PDT 2009,bubbameadows,Video: Today�s video blog/vlog�thing.,,True
312,4,1997694855,Mon Jun 01 17:17:50 PDT 2009,lelialinden,"japanese restaraunt, drink saquê, order HAHA",,True
359,0,1966179531,Fri May 29 17:16:29 PDT 2009,FindingDani,"Hey boys, wanna see teeth? (damn it, twitter don�t wanna change picture)",,True
...,...,...,...,...,...,...,...
999624,4,2175611191,Mon Jun 15 01:28:56 PDT 2009,jemappellekim,Best cover version ever ♫,,True
999736,0,2204878136,Wed Jun 17 03:22:19 PDT 2009,LLinae,"ended even started. - wont long. 30th June, 13 days. I’m scared be...",,True
999822,4,1989354486,Mon Jun 01 01:14:01 PDT 2009,jangles,song help ♫,,True
999856,4,2014715682,Wed Jun 03 02:26:53 PDT 2009,lenseffect,"а по радиото какви �?а, аууу. ..",,True


In [None]:
df_cleaned.drop(df_cleaned[df_cleaned['ContainsGarbledText'] == True].index, inplace=True)

In [None]:
df_cleaned.drop(columns=['ContainsGarbledText'],inplace=True)

In [None]:
df_cleaned

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis
0,4,2000548391,Mon Jun 01 22:22:01 PDT 2009,ticia42,"can't open door herself, think feels pain would.",
2,4,1754199174,Sun May 10 05:23:02 PDT 2009,hockeycrew,least lawn hasn't taken over field weeds!,
3,0,1994056674,Mon Jun 01 11:20:41 PDT 2009,GeoBlack_Cat,"umm.. like, hello? where's child support payment? Hope ex still gainfully employed - many people know currently not",
4,4,1980150068,Sun May 31 05:51:31 PDT 2009,rawrcelne,Joined twitter,
5,0,2242922534,Fri Jun 19 12:48:06 PDT 2009,Whacky,Gayle wrong guy wrong team much like Brian Lara #t20wc,
...,...,...,...,...,...,...
999995,0,1985361990,Sun May 31 16:57:39 PDT 2009,lutheasalom,song's middle change doesn't want born..... arghhhh!!,
999996,4,2057029784,Sat Jun 06 12:14:24 PDT 2009,beeluz,Good luck,
999997,0,1835639354,Mon May 18 06:26:21 PDT 2009,lordmuttley,rather average 32370,
999998,0,2246780174,Fri Jun 19 18:06:46 PDT 2009,MizSadittyFancy,Pickin waitin 2 hurry up...I odeeee missed dem Table talk 2nite...LOL bout fat...,


In [None]:
df_cleaned.shape

(987629, 6)

## Extract date features (note: this part was not used for modeling)

In [None]:
df['Date']

0         Mon Jun 01 22:22:01 PDT 2009
1         Tue Jun 16 06:13:30 PDT 2009
2         Sun May 10 05:23:02 PDT 2009
3         Mon Jun 01 11:20:41 PDT 2009
4         Sun May 31 05:51:31 PDT 2009
                      ...             
999995    Sun May 31 16:57:39 PDT 2009
999996    Sat Jun 06 12:14:24 PDT 2009
999997    Mon May 18 06:26:21 PDT 2009
999998    Fri Jun 19 18:06:46 PDT 2009
999999    Sun May 17 23:52:31 PDT 2009
Name: Date, Length: 999340, dtype: object

In [None]:
import pandas as pd
from dateutil import parser
from dateutil.tz import gettz

# Define the timezone mapping
tzinfos = {
    'PDT': gettz('America/Los_Angeles')
}

# Convert the 'Date' column to timezone-aware datetimes
df_cleaned['Date'] = df_cleaned['Date'].apply(lambda x: parser.parse(x, tzinfos=tzinfos))



#### Error will be fixed after kernel restart

In [None]:
# Extract features
df_cleaned['Year'] = df_cleaned['Date'].dt.year
df_cleaned['Month'] = df_cleaned['Date'].dt.month
df_cleaned['Day'] = df_cleaned['Date'].dt.day
df_cleaned['Hour'] = df_cleaned['Date'].dt.hour
df_cleaned['Minute'] = df_cleaned['Date'].dt.minute
df_cleaned['Second'] = df_cleaned['Date'].dt.second
df_cleaned['DayOfWeek'] = df_cleaned['Date'].dt.dayofweek
df_cleaned['IsWeekend'] = df_cleaned['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)  # Weekend if Saturday(5) or Sunday(6)
df_cleaned['TimeOfDay'] = df_cleaned['Hour'].apply(lambda x: 'Night' if 0 <= x < 6 else 'Morning' if 6 <= x < 12 else 'Afternoon' if 12 <= x < 18 else 'Evening')

DayOfWeek 0 -> Monday

In [None]:
df_cleaned

Unnamed: 0,labels,id,Date,User,Fixed_Text,Emojis,Year,Month,Day,Hour,Minute,Second,DayOfWeek,IsWeekend,TimeOfDay
0,4,2000548391,2009-06-01 22:22:01-07:00,ticia42,"can't open door herself, think feels pain would.",,2009,6,1,22,22,1,0,0,Evening
2,4,1754199174,2009-05-10 05:23:02-07:00,hockeycrew,least lawn hasn't taken over field weeds!,,2009,5,10,5,23,2,6,1,Night
3,0,1994056674,2009-06-01 11:20:41-07:00,GeoBlack_Cat,"umm.. like, hello? where's child support payment? Hope ex still gainfully employed - many people know currently not",,2009,6,1,11,20,41,0,0,Morning
4,4,1980150068,2009-05-31 05:51:31-07:00,rawrcelne,Joined twitter,,2009,5,31,5,51,31,6,1,Night
5,0,2242922534,2009-06-19 12:48:06-07:00,Whacky,Gayle wrong guy wrong team much like Brian Lara #t20wc,,2009,6,19,12,48,6,4,0,Afternoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,1985361990,2009-05-31 16:57:39-07:00,lutheasalom,song's middle change doesn't want born..... arghhhh!!,,2009,5,31,16,57,39,6,1,Afternoon
999996,4,2057029784,2009-06-06 12:14:24-07:00,beeluz,Good luck,,2009,6,6,12,14,24,5,1,Afternoon
999997,0,1835639354,2009-05-18 06:26:21-07:00,lordmuttley,rather average 32370,,2009,5,18,6,26,21,0,0,Morning
999998,0,2246780174,2009-06-19 18:06:46-07:00,MizSadittyFancy,Pickin waitin 2 hurry up...I odeeee missed dem Table talk 2nite...LOL bout fat...,,2009,6,19,18,6,46,4,0,Evening


In [None]:
df_cleaned.Year.unique()

array([2009])

In [None]:
df_cleaned.Month.unique()

array([6, 5, 4])

In [None]:
df_cleaned.Day.unique()

array([ 1, 10, 31, 19,  6,  3, 30, 16,  5, 20, 14, 22, 29, 15, 18, 23, 28,
        2, 17,  7,  4, 21,  9, 11, 24, 13, 25, 26, 27])

In [None]:
df_cleaned.DayOfWeek.unique()

array([0, 6, 4, 2, 5, 1, 3])

In [None]:
df_cleaned.IsWeekend.unique()

array([0, 1], dtype=int64)

In [None]:
df_cleaned.drop(columns=['Date', 'Year', 'Month', 'Day', 'DayOfWeek'],inplace=True)

In [None]:
df_cleaned.drop(columns=['Hour', 'Minute', 'Second'],inplace=True)

In [None]:
df_cleaned

Unnamed: 0,labels,id,User,Fixed_Text,Emojis,IsWeekend,TimeOfDay
0,4,2000548391,ticia42,"can't open door herself, think feels pain would.",,0,Evening
2,4,1754199174,hockeycrew,least lawn hasn't taken over field weeds!,,1,Night
3,0,1994056674,GeoBlack_Cat,"umm.. like, hello? where's child support payment? Hope ex still gainfully employed - many people know currently not",,0,Morning
4,4,1980150068,rawrcelne,Joined twitter,,1,Night
5,0,2242922534,Whacky,Gayle wrong guy wrong team much like Brian Lara #t20wc,,0,Afternoon
...,...,...,...,...,...,...,...
999995,0,1985361990,lutheasalom,song's middle change doesn't want born..... arghhhh!!,,1,Afternoon
999996,4,2057029784,beeluz,Good luck,,1,Afternoon
999997,0,1835639354,lordmuttley,rather average 32370,,0,Morning
999998,0,2246780174,MizSadittyFancy,Pickin waitin 2 hurry up...I odeeee missed dem Table talk 2nite...LOL bout fat...,,0,Evening


In [None]:
dataset = df_cleaned.copy()

In [None]:
## Check garbled text in DF

def contains_non_ascii(text):
    """Tell whether any character of ``text`` lies outside the ASCII range.

    Returns False for an all-ASCII (or empty) string, True otherwise.
    """
    is_pure_ascii = text.isascii()
    return not is_pure_ascii

# Apply the function to the DataFrame
dataset['ContainsGarbledText'] = dataset['Fixed_Text'].apply(contains_non_ascii)

In [None]:
dataset[dataset['ContainsGarbledText']== True]

Unnamed: 0,labels,id,User,Fixed_Text,Emojis,IsWeekend,TimeOfDay,ContainsGarbledText


In [None]:
dataset.drop(columns=['ContainsGarbledText'], inplace=True)

# Modeling

In [None]:
# `data` is only created in the next cell; inspect the frame that exists here.
dataset.shape

NameError: name 'data' is not defined

In [None]:
data = dataset.copy()

In [None]:
data.columns

Index(['labels', 'id', 'User', 'Fixed_Text', 'Emojis', 'IsWeekend',
       'TimeOfDay'],
      dtype='object')

In [None]:
data = data[['Fixed_Text', 'labels']]
data = data.rename(columns={'Fixed_Text': 'text', 'labels': 'label'})

In [None]:
data['label'] = data['label'].apply(lambda x: 0 if x == 0 else 1)

## Libraries import

In [None]:
pip install transformers datasets torch

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --upgrade transformers torch




In [None]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --upgrade transformers accelerate

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install transformers==4.18.0 accelerate==0.7.0


In [None]:
pip show transformers accelerate

In [None]:
pip install transformers==4.18.0 accelerate==0.7.0

In [None]:
pip install transformers[torch] accelerate -U

In [None]:
pip install accelerate>=0.21.0

In [None]:
pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp310-cp310-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   - -------------------------------------- 30.7/991.5 kB 1.4 MB/s eta 0:00:01
   - -------------------------------------- 30.7/991.5 kB 1.4 MB/s eta 0:00:01
   -- ------------------------------------ 61.4/991.5 kB 409.6 kB/s eta 0:00:03
   --- ----------------------------------- 92.2/991.5 kB 525.1 kB/s eta 0:00:02
   ---- --------------------------------- 112.6/991.5 kB 547.6 kB/s eta 0:00:02
   ----- -------------------------------- 143.4/991.5 kB 568.9 kB/s eta 0:00:02
   ----- -------------------------------- 153.6/991.5 kB 573.4 kB/s eta 0:00:02
   ------- ------------------------------ 194.6/991.5 kB 622.7 kB/s eta 0:00:02
   -------- ----------------------------- 225.3/991.5 kB 655.6 kB/s eta 0:00:02
   --------- ----------------------------

## Cleaned data

## Distilbert

In [None]:
# index=False keeps the row index out of the file, so reloading does not
# produce a stray "Unnamed: 0" column.
data.to_csv('saved_dataset.csv', index=False)

In [None]:
import pandas as pd

In [None]:
# keep_default_na=False stops pandas from turning tweets whose whole text is
# "NA", "null", "nan", ... into float NaN; they are read back as plain strings.
data = pd.read_csv('saved_dataset.csv', keep_default_na=False)

In [None]:
data

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"can't open door herself, think feels pain would.",1
1,2,least lawn hasn't taken over field weeds!,1
2,3,"umm.. like, hello? where's child support payme...",0
3,4,Joined twitter,1
4,5,Gayle wrong guy wrong team much like Brian Lar...,0
...,...,...,...
987624,999995,song's middle change doesn't want born..... ar...,0
987625,999996,Good luck,1
987626,999997,rather average 32370,0
987627,999998,Pickin waitin 2 hurry up...I odeeee missed dem...,0


In [None]:
data['text'] = data['text'].astype(str)

In [None]:
data

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"can't open door herself, think feels pain would.",1
1,2,least lawn hasn't taken over field weeds!,1
2,3,"umm.. like, hello? where's child support payme...",0
3,4,Joined twitter,1
4,5,Gayle wrong guy wrong team much like Brian Lar...,0
...,...,...,...
987624,999995,song's middle change doesn't want born..... ar...,0
987625,999996,Good luck,1
987626,999997,rather average 32370,0
987627,999998,Pickin waitin 2 hurry up...I odeeee missed dem...,0


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_texts, test_texts, train_labels, test_labels = train_test_split(data['text'].tolist(), data['label'].tolist(), test_size=0.1)

In [None]:
data_sampled = data.sample(frac=1, random_state=42)

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data_sampled['text'].tolist(),
    data_sampled['label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [None]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenization function
def tokenize(texts):
    """Encode ``texts`` with the notebook-level ``tokenizer``.

    Padding and truncation are switched on and the result is requested as
    PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
    return tokenizer(texts, **encode_options)

# Tokenize the data
train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)

In [None]:
train_encodings

{'input_ids': tensor([[  101,  2183, 27090,  ...,     0,     0,     0],
        [  101,  2279,  2048,  ...,     0,     0,     0],
        [  101,  2183,  3509,  ...,     0,     0,     0],
        ...,
        [  101,  2134,  2102,  ...,     0,     0,     0],
        [  101, 24026,  1012,  ...,     0,     0,     0],
        [  101,  4654,  1005,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable tensor and
    ``labels`` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Each field is copied so callers cannot mutate the stored encodings.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        return sample

In [None]:
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score

import torch

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function with batch-wise validation
def train(epoch, model, train_loader, test_loader, optimizer, eval_every=1):
    """Train ``model`` for one pass over ``train_loader`` with periodic validation.

    Parameters
    ----------
    epoch : int
        Epoch number, used for logging only.
    model : torch.nn.Module
        Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
        whose output exposes ``.loss``.
    train_loader : iterable
        Yields batches with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    test_loader : iterable
        Passed unchanged to ``evaluate`` for validation.
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.
    eval_every : int, optional
        Run validation after every ``eval_every`` training batches (and once
        more after the final batch if it was not validated). The default of 1
        reproduces the original validate-after-every-batch behaviour, which
        runs a full pass over ``test_loader`` per training batch; use a larger
        value for big validation sets.

    Returns
    -------
    tuple
        ``(best_model_state, best_val_loss)``: a snapshot of the state dict
        with the lowest validation loss seen (``None`` if no batch was
        processed) and that loss.

    Raises
    ------
    ValueError
        If ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError('eval_every must be >= 1')

    best_val_loss = float('inf')
    best_model_state = None

    def _validate(batch_idx, train_loss):
        """Validate once, log, and snapshot the weights on improvement."""
        nonlocal best_val_loss, best_model_state
        accuracy, val_loss = evaluate(model, test_loader)
        # evaluate() may leave the model in eval mode; switch back so the
        # following batches train with dropout enabled.
        model.train()
        # NOTE(review): this throttles on the epoch number, so within a logged
        # epoch every validation prints — confirm batch_idx was not intended.
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Batch {batch_idx}, Training Loss: {train_loss}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() returns references to the live parameters, so copy
            # them (to CPU, to spare GPU memory); otherwise later optimizer
            # steps would silently overwrite the "best" weights.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    model.train()
    pending = None  # (batch_idx, loss) of the latest batch not yet validated

    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if (batch_idx + 1) % eval_every == 0:
            _validate(batch_idx, loss.item())
            pending = None
        else:
            pending = (batch_idx, loss.item())

    # Make sure the weights after the final batch are validated as well.
    if pending is not None:
        _validate(*pending)

    return best_model_state, best_val_loss


In [None]:
# Evaluation function
def evaluate(model, test_loader):
    """Compute accuracy and mean batch loss of ``model`` over ``test_loader``.

    The model is put in eval mode for the duration of the pass and its
    previous train/eval mode is restored afterwards, so calling this from
    inside a training loop does not leave dropout disabled.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
        whose output exposes ``.logits`` and ``.loss``.
    test_loader : iterable with ``len()``
        Yields batches with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns
    -------
    tuple
        ``(accuracy, average_loss)`` where ``average_loss`` is the sum of the
        per-batch losses divided by the number of batches.
    """
    was_training = model.training
    model.eval()
    predictions, true_labels = [], []
    total_loss = 0.0

    try:
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                logits = outputs.logits
                loss = outputs.loss
                total_loss += loss.item()

                # Predicted class = index of the largest logit per example.
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                label_ids = labels.cpu().numpy()

                predictions.extend(preds)
                true_labels.extend(label_ids)
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    average_loss = total_loss / len(test_loader)
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, average_loss

In [None]:
EPOCHS = 1
best_model_state = None
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    model_state, val_loss = train(epoch, model, train_loader, test_loader, optimizer)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_state = model_state


Epoch 0, Batch 0, Training Loss: 0.6901992559432983, Validation Loss: 0.6962, Accuracy: 0.4991
Epoch 0, Batch 1, Training Loss: 0.7084370851516724, Validation Loss: 0.6881, Accuracy: 0.5527
Epoch 0, Batch 2, Training Loss: 0.6797231435775757, Validation Loss: 0.6850, Accuracy: 0.5683
Epoch 0, Batch 3, Training Loss: 0.6836034059524536, Validation Loss: 0.6834, Accuracy: 0.5787
Epoch 0, Batch 4, Training Loss: 0.6806811690330505, Validation Loss: 0.6807, Accuracy: 0.5826
Epoch 0, Batch 5, Training Loss: 0.6665593981742859, Validation Loss: 0.6787, Accuracy: 0.5906
Epoch 0, Batch 6, Training Loss: 0.6942813992500305, Validation Loss: 0.6847, Accuracy: 0.5312
Epoch 0, Batch 7, Training Loss: 0.6686683297157288, Validation Loss: 0.6914, Accuracy: 0.5032
Epoch 0, Batch 8, Training Loss: 0.6629774570465088, Validation Loss: 0.6939, Accuracy: 0.5013
Epoch 0, Batch 9, Training Loss: 0.7244338393211365, Validation Loss: 0.6886, Accuracy: 0.5112
Epoch 0, Batch 10, Training Loss: 0.71426284313201

In [None]:
# Load the best model state
if best_model_state:
    model.load_state_dict(best_model_state)

# Print the parameters of the model with the minimum validation loss
print("Parameters of the model with the minimum validation loss:")
for name, param in model.named_parameters():
    print(f"{name}: {param.data}")

print("Training complete.")

## Roberta

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data_sampled['text'].tolist(),
    data_sampled['label'].tolist(),
    test_size=0.1,
    random_state=42
)

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenization function
def tokenize(texts):
    """Run the notebook-level ``tokenizer`` over ``texts``.

    Padding and truncation are enabled and PyTorch tensors are requested
    (``return_tensors='pt'``).
    """
    settings = dict(padding=True, truncation=True, return_tensors='pt')
    return tokenizer(texts, **settings)

# Tokenize the data
train_encodings = tokenize(train_texts)
test_encodings = tokenize(test_texts)

# Define a custom dataset class
class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable tensor and
    ``labels`` is a sequence indexed in the same order.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a copy of every encoding field at ``idx`` plus ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            # clone() so the returned tensor does not share storage with the full encoding
            sample[field] = values[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx]).clone().detach()
        return sample

# Wrap each split's encodings and labels in a dataset
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)

# Batch both splits; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Load the pretrained model with a two-label head, then move it to the GPU when
# one is available and to the CPU otherwise
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Optimise every model parameter with AdamW at a learning rate of 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train(epoch, model, train_loader, optimizer):
    """Run one pass over ``train_loader``, taking an optimizer step per batch.

    Args:
        epoch: Epoch number; only used in the progress message.
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.loss``.
        train_loader: Iterable of batches, each a dict holding those three keys.
        optimizer: Optimizer that is zeroed and stepped once per batch.

    Batch tensors are moved to the notebook-level ``device``. The loss of every
    100th batch (starting with batch 0) is printed.
    """
    model.train()
    batch_idx = 0
    for batch in train_loader:
        # Move the three tensors the model consumes onto the target device
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}')
        batch_idx += 1

# Evaluation function
def evaluate(model, test_loader):
    """Score ``model`` on ``test_loader`` without updating its weights.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.logits`` and ``.loss``.
        test_loader: Iterable of batches, each a dict holding those three keys.

    Returns:
        ``(accuracy, average_loss)``: accuracy of the argmax predictions and the
        loss averaged over samples.

    Raises:
        ZeroDivisionError: if ``test_loader`` yields no samples.
    """
    model.eval()
    predictions, true_labels = [], []
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            # The Hugging Face classification heads report a batch-mean loss, so weight
            # it by the batch size; otherwise a smaller final batch would count as much
            # as a full one in the average.
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            total_samples += batch_size

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            label_ids = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(label_ids)

    average_loss = total_loss / total_samples
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, average_loss

# Training loop: EPOCHS passes of training, each followed by an evaluation.
# The "Validation" figures printed below are computed on test_loader.
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch, model, train_loader, optimizer)
    # evaluate() returns (accuracy, average loss)
    accuracy, val_loss = evaluate(model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

print("Training complete.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 0, Batch 0, Loss: 0.6756503582000732
Epoch 0, Batch 100, Loss: 0.3952513635158539
Epoch 0, Batch 200, Loss: 0.3906814455986023
Epoch 0, Batch 300, Loss: 0.43320342898368835
Epoch 0, Batch 400, Loss: 0.37270355224609375
Epoch 0, Batch 500, Loss: 0.48311737179756165
Epoch 0, Batch 600, Loss: 0.3824498951435089
Epoch 0, Batch 700, Loss: 0.4639070928096771
Epoch 0, Batch 800, Loss: 0.4812706708908081
Epoch 0, Batch 900, Loss: 0.4120330810546875
Epoch 0, Batch 1000, Loss: 0.5286306738853455
Epoch 0, Batch 1100, Loss: 0.38292133808135986
Epoch 0, Batch 1200, Loss: 0.5042079091072083
Epoch 0, Batch 1300, Loss: 0.4365707337856293
Epoch 0, Batch 1400, Loss: 0.43188339471817017
Epoch 0, Batch 1500, Loss: 0.5428134202957153
Epoch 0, Batch 1600, Loss: 0.4794994294643402
Epoch 0, Batch 1700, Loss: 0.3960758149623871
Epoch 0, Batch 1800, Loss: 0.3293410539627075
Epoch 0, Batch 1900, Loss: 0.4995534420013428
Epoch 0, Batch 2000, Loss: 0.30456405878067017
Epoch 0, Batch 2100, Loss: 0.34314218163

## GPT2

In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# Initialize the tokenizer from the pretrained 'gpt2' checkpoint.
# This rebinds the notebook-level `tokenizer` name, so any later tokenize() call uses it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token, so we use eos_token

# Tokenization function
def tokenize(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``, with padding and
    truncation enabled and PyTorch tensors requested."""
    options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}
    return tokenizer(texts, **options)

# Tokenize the data (training split first, then the test split)
train_encodings, test_encodings = (
    tokenize(split_texts) for split_texts in (train_texts, test_texts)
)

# Define a custom dataset class
class TextDataset(Dataset):
    """Dataset that serves one tokenized example together with its label.

    ``encodings`` maps field names to indexable tensors; ``labels`` is a
    sequence aligned with them.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample dict for position ``idx``: every encoding field plus ``'labels'``."""
        # Copy each field's row so the sample does not alias the full encoding tensors
        fields = {name: column[idx].clone().detach() for name, column in self.encodings.items()}
        label = torch.tensor(self.labels[idx]).clone().detach()
        return {**fields, 'labels': label}

# Build one dataset per split from its encodings and labels
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)

# Data loaders: a smaller batch size (8) is used for GPT-2; only training data is shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)

# Pick the GPU when available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained model with a two-label head and move it to the chosen device
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
# The model config needs the padding token id set explicitly
model.config.pad_token_id = tokenizer.pad_token_id
model.to(device)

# AdamW over all model parameters, learning rate 5e-5
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train(epoch, model, train_loader, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        epoch: Epoch number, used only in the progress message.
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.loss``.
        train_loader: Iterable of dict batches containing those three keys.
        optimizer: Optimizer zeroed before and stepped after each backward pass.

    Tensors are moved to the notebook-level ``device``; the loss is printed for
    batch 0 and every 100th batch after it.
    """
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        # Unpack the batch in a fixed order while moving each tensor to the device
        ids, mask, targets = (
            batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        loss = model(input_ids=ids, attention_mask=mask, labels=targets).loss
        loss.backward()
        optimizer.step()

        should_report = batch_idx % 100 == 0
        if should_report:
            print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item()}')

# Evaluation function
def evaluate(model, test_loader):
    """Score ``model`` on ``test_loader`` without updating its weights.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.logits`` and ``.loss``.
        test_loader: Iterable of batches, each a dict holding those three keys.

    Returns:
        ``(accuracy, average_loss)``: accuracy of the argmax predictions and the
        loss averaged over samples.

    Raises:
        ZeroDivisionError: if ``test_loader`` yields no samples.
    """
    model.eval()
    predictions, true_labels = [], []
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            # The Hugging Face classification heads report a batch-mean loss, so weight
            # it by the batch size; otherwise a smaller final batch would count as much
            # as a full one in the average.
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            total_samples += batch_size

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            label_ids = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(label_ids)

    average_loss = total_loss / total_samples
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy, average_loss

# Training loop with checkpointing: after each epoch the model is evaluated and its
# weights are written to 'best_model.pt' whenever the validation loss improves
EPOCHS = 1
best_val_loss = float('inf')

for epoch in range(EPOCHS):
    train(epoch, model, train_loader, optimizer)
    accuracy, val_loss = evaluate(model, test_loader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Validation Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Only a strictly lower loss counts as an improvement. Written as `not (a < b)`
    # rather than `a >= b` so that a NaN loss still never triggers a save.
    if not (val_loss < best_val_loss):
        continue

    best_val_loss = val_loss
    torch.save(model.state_dict(), 'best_model.pt')
    print("Model saved!")

print("Training complete.")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch 0, Batch 0, Loss: 0.982352077960968
Epoch 0, Batch 100, Loss: 0.5775084495544434
Epoch 0, Batch 200, Loss: 0.3348778188228607
Epoch 0, Batch 300, Loss: 0.4932810068130493
Epoch 0, Batch 400, Loss: 0.6009963750839233
Epoch 0, Batch 500, Loss: 0.5519117116928101
Epoch 0, Batch 600, Loss: 0.3971640467643738
Epoch 0, Batch 700, Loss: 0.5178024172782898
Epoch 0, Batch 800, Loss: 0.4836016893386841
Epoch 0, Batch 900, Loss: 0.585265040397644
Epoch 0, Batch 1000, Loss: 0.460182249546051
Epoch 0, Batch 1100, Loss: 0.7870944142341614
Epoch 0, Batch 1200, Loss: 0.6717519164085388
Epoch 0, Batch 1300, Loss: 0.6331472396850586
Epoch 0, Batch 1400, Loss: 0.49488747119903564
Epoch 0, Batch 1500, Loss: 0.5924420952796936
Epoch 0, Batch 1600, Loss: 0.3732205033302307
Epoch 0, Batch 1700, Loss: 0.580284833908081
Epoch 0, Batch 1800, Loss: 0.6608132123947144
Epoch 0, Batch 1900, Loss: 0.8326858878135681
Epoch 0, Batch 2000, Loss: 0.4280773103237152
Epoch 0, Batch 2100, Loss: 0.6026865243911743
Epo

In [None]:
# Rebuild the GPT-2 classifier and restore the fine-tuned weights from 'best_model.pt'
# (assumes training has completed and the checkpoint file has been saved).
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id  # Set padding token id
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# weights_only=True restricts unpickling to tensors and plain containers, which is all
# a state_dict holds, so a tampered checkpoint file cannot run arbitrary code on load.
model.load_state_dict(torch.load('best_model.pt', map_location=device, weights_only=True))
model.to(device)
model.eval()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [None]:
# Display the device chosen for the model (CUDA when available, otherwise CPU)
device

device(type='cuda')

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

# Assumes `data_sampled`, `tokenize` and `tokenizer` are already defined by earlier cells.
# NOTE(review): `tokenize` uses whichever `tokenizer` was bound last; it needs to be the
# GPT-2 tokenizer to match the GPT-2 checkpoint loaded in this cell — confirm run order.
# Tokenize every row of the sampled data.
# NOTE(review): the train/test split was drawn from data_sampled['text'], so these
# predictions include rows the model was trained on.
predict_encodings = tokenize(data_sampled['text'].tolist())

# Create a dataset class for prediction
class PredictDataset(Dataset):
    """Label-free dataset over tokenizer encodings, used for inference.

    ``encodings`` is a mapping from field name to an indexable tensor and must
    contain an ``'input_ids'`` entry, which determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One sample per row of the 'input_ids' field
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return a dict with a copy of every encoding field at position ``idx``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        return sample

# Create the prediction dataset and dataloader; shuffle=False keeps the batches in the
# order the texts were tokenized, so predictions line up with their source rows
predict_dataset = PredictDataset(predict_encodings)
predict_loader = DataLoader(predict_dataset, batch_size=8, shuffle=False)

# Load the trained model: rebuild the GPT-2 classifier and restore 'best_model.pt'
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# weights_only=True restricts unpickling to tensors and plain containers, which is all
# a state_dict holds, so a tampered checkpoint file cannot run arbitrary code on load.
model.load_state_dict(torch.load('best_model.pt', map_location=device, weights_only=True))
model.to(device)
model.eval()

# Predict function
def predict(model, predict_loader):
    """Return the predicted class index for every sample in ``predict_loader``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its output
            must expose ``.logits``.
        predict_loader: Iterable of batches, each a dict holding those two keys.

    Returns:
        A flat list of NumPy integer class indices (argmax over dimension 1 of the
        logits), one per sample, in the order the loader yields them.

    Batch tensors are moved to the notebook-level ``device``.
    """
    # Switch to inference mode here rather than relying on the caller having done it,
    # so dropout can never be left active while predicting.
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in predict_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)

    return predictions

# Perform predictions
predicted_labels = predict(model, predict_loader)

# Add the predictions to the dataframe as a new 'predicted_label' column.
# This modifies `data_sampled` in place and relies on predict_loader (shuffle=False)
# having preserved the row order of data_sampled['text'].
data_sampled['predicted_label'] = predicted_labels

# Print the first few rows to check the predictions
print(data_sampled.head())

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Conclusion: Based on my experiments, the best model is GPT2.