In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'hasoc-englishdata:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4571530%2F7806275%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240311%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240311T062623Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D84410002dce646f190bf89a3d82e9c5cff7e20d445e6d7d71520cee913261455c9a5db8e2f1727943ccb2f5c7853303af175cb588c586c47b9fa26657045a5153d95fcdc4acadd170d7ad7be798f83bda6a3d2591e872881a7bc02074b95d5b5f4f4a892a72837f354d320f9dccf2a380bd175e1827c499d13bcae4098cf09803cca6b116d41c0dcf670c787e45628b8a1da27baedf8c46430d2dc0ed69087f7d3770fe0b49b6bffd31ef95437d211b9f07020ee9a72923863c8506bd08e73dde5925236052db5c93ac3f40a04562ca6e89531b55407c04cef1f4a8e3978c914e65359e97d96a8d1e3f5b9c8d581b3a8ed5cc3c178ab27cd353c50455ed03c0a,hasoc20-english:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4575721%2F7812037%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240311%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240311T062623Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D939be738494f85d7de64cdda54ee14a47be18afc092fe7f21482ae67ecd2acce00d049256e84c9ecd00da3e9e80c4a0e49b63ec072a19b334831870dd1207744b4f075919e9e68a189ad607cbce138129173af5da8026f41c63f1492578da7907e22bcfd4136374ebf63705ffef07064817652b648ae02446ba9dec09cce8bed9e752a4aded33195c5574ff9fad69fe893811529e23aa578b163ebc78bddbeb04225392bc306fb50101a61fc329462d892a6512e7f8109effa11a6f8096fb718b817ce7ce3f7b15a5234894eec71e41cee957bd0ce2707bea823000033e965ac0c9af54cd989fc03a6c9fd31a31e04b5f4057309e9f6a37a41958dff4811f90c,hasoc19-englishdata:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4576012%2F7812392%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-
Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240311%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240311T062623Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D91822775b8963a2e26dc550d6cfc4699af7c792e2644adf3da162e654351c891c52d599282db8ee15e5b4d74166e89c04c5dda97bee18ae4d41a217795c27462047a42aac37b64925490b72eeae8e82b815ff114a785fb1435800de424d6239e3bbe26182ad55f7d7ba4e08b0640fac61307c950f4de44913d6e1b0d481d03dcfa235aa3ac1f84d2857c5b6f2cbb58265076ba16dedee90a3c7f654579ede8cd96fb1db30e98dd66a7d10b07db5b4a7a17e31d8227456b146f356bda9603b4a9c79a6176caa7933bacb38bcd71441f438526c4239ba7173656bd17abbc5cf78c3cbc4da62e5ca587848e91ffacbc8ba84f51292b092793334d7b112761550c4a'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every data source listed in DATA_SOURCE_MAPPING and unpack it into
# its own directory below KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<percent-encoded URL>"; split on the first
    # colon only so the URL part can never be cut short.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; the progress bar is
            # then replaced by a plain byte counter instead of crashing.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if expected_bytes > 0:
                    done = min(50, int(50 * dl / expected_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk before the archive is read back,
            # in particular when it is reopened by name below.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be called `tarfile`: that would rebind
                # the module name and break every later tar archive.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading hasoc-englishdata, 331092 bytes compressed
Downloaded and uncompressed: hasoc-englishdata
Downloading hasoc20-english, 325976 bytes compressed
Downloaded and uncompressed: hasoc20-english
Downloading hasoc19-englishdata, 97428 bytes compressed
Downloaded and uncompressed: hasoc19-englishdata
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below /kaggle/input
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hasoc-englishdata/english_2021.csv
/kaggle/input/hasoc19-englishdata/english_2019_2.tsv
/kaggle/input/hasoc20-english/english_2020.xlsx


In [None]:
# Lift pandas' display limits: show all rows, full cell contents, all columns.
for display_option in ('display.max_rows', 'display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

# The three source files: 2021 (CSV), 2020 (Excel workbook), 2019 (tab-separated).
df = pd.read_csv('/kaggle/input/hasoc-englishdata/english_2021.csv')
df2 = pd.read_excel('/kaggle/input/hasoc20-english/english_2020.xlsx')
df3 = pd.read_csv('/kaggle/input/hasoc19-englishdata/english_2019_2.tsv', sep='\t')




In [None]:
# Preview the first rows of each of the three loaded datasets
for raw_frame in (df, df2, df3):
    print(raw_frame.head())

   Unnamed: 0                       _id  \
0        4986  60c5d6bf5659ea5e55defa2c   
1        3394  60c5d6bf5659ea5e55def461   
2        1310  60c5d6bf5659ea5e55defaad   
3        3390  60c5d6bf5659ea5e55def419   
4        4626  60c5d6bf5659ea5e55def7fa   

                                                                                                                                                                                                                                                                                                                                              text  \
0                                                                                                                          @wealth if you made it through this &amp;&amp; were not only able to start making money for yourself but sustain living that way all from home, fuck these companies &amp; corporate pigs. power to the people, always.   
1                                                        

In [None]:
# Column labels of each of the three loaded datasets
for raw_frame in (df, df2, df3):
    print(raw_frame.columns)

Index(['Unnamed: 0', '_id', 'text', 'task_1', 'task_2'], dtype='object')
Index(['tweet_id', 'text', 'task1', 'task2', 'ID'], dtype='object')
Index(['text_id', 'text', 'task_1', 'task_2', 'task_3'], dtype='object')


In [None]:
# Drop the listed id and extra-task columns from each dataset
df = df.drop(columns=['Unnamed: 0', '_id', 'task_2'])
df2 = df2.drop(columns=['tweet_id', 'task2', 'ID'])
df3 = df3.drop(columns=['text_id', 'task_2', 'task_3'])


In [None]:
# Rename the task column of each frame to 'label' (in place)
for frame, task_column in ((df, 'task_1'), (df2, 'task1'), (df3, 'task_1')):
    frame.rename(columns={task_column: 'label'}, inplace=True)

**Combine datasets**

In [None]:
# Stack the three frames row-wise; ignore_index gives the result a fresh index
frames_to_stack = [df, df2, df3]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)

In [None]:
# Show the column labels of the combined frame
df.columns

Index(['text', 'label'], dtype='object')

**Remove duplicate rows**

In [None]:
# Remove rows whose values repeat an earlier row in every column. The frame is
# modified in place and the remaining rows keep their existing index labels.
df.drop_duplicates(inplace=True)

In [None]:
# (rows, columns) of the frame after duplicate removal
df.shape

(8693, 2)

**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct values of the 'label' column, then overwrite the column
# with their integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])
print(df.head())  #Hate means 0

                                                                                                                                                                                                                                                                                                                                              text  \
0                                                                                                                          @wealth if you made it through this &amp;&amp; were not only able to start making money for yourself but sustain living that way all from home, fuck these companies &amp; corporate pigs. power to the people, always.   
1                                                                                                                                                                                                                                                               Technically that's still turning back the clock, dick head h

**Convert emojis to text**

In [None]:
!pip install emoji


Collecting emoji
  Downloading emoji-2.10.1-py2.py3-none-any.whl (421 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/421.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/421.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.10.1


In [None]:
import emoji

def demojize_text(text):
    """Return `text` as processed by emoji.demojize."""
    demojized = emoji.demojize(text)
    return demojized

# Overwrite the 'text' column with the demojized version of every entry
df['text'] = df['text'].map(demojize_text)


**Lowercasing**

In [None]:
# Lowercase every entry of the 'text' column
df['text'] = df['text'].str.lower()


**Remove unwanted patterns in text**

In [None]:
import re
import string


# Patterns are compiled once and reused for every row.
_NEWLINE_RE = re.compile(r'[\r\n]+')
_MENTION_RE = re.compile(r'@\S+')
_HTML_AMP_RE = re.compile(r'&amp;?')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_DIGITS_RE = re.compile(r'\d+')


def strip_unwanted_patterns(text):
    """Remove line breaks, @usernames, `&amp;` entities, URLs and digits from `text`.

    Line breaks and `&amp;` entities are replaced by a single space rather than
    by nothing, so the words on either side are not glued together. The whole
    entity including its trailing ';' is removed.
    """
    # Line breaks go first so a mention at the end of a line cannot swallow
    # the first word of the next line.
    text = _NEWLINE_RE.sub(' ', text)
    text = _MENTION_RE.sub('', text)
    text = _HTML_AMP_RE.sub(' ', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    return text


df['text'] = df['text'].apply(strip_unwanted_patterns)

print(df['text'].head())

0                                                           if you made it through this ;; were not only able to start making money for yourself but sustain living that way all from home, fuck these companies ; corporate pigs. power to the people, always.
1                                                                                                                                                                                                   technically that's still turning back the clock, dick head 
2            and you're the govt?!?! stop thinking about world media, liberal gangs or any optics whatsoever and act now already.  if this is what a person at your level is facing then shudder to think the plight of common people in bengal. #bengalburning
3                                                                                                                                                                                                                            soldier of 

**Expand contractions**

In [None]:
!pip install contractions
import contractions

def expand_contractions(text):
    """Return `text` with its contractions expanded by contractions.fix."""
    return contractions.fix(text)

# Expand contractions in every entry of the 'text' column
df['text'] = df['text'].apply(expand_contractions)
print(df['text'].head())

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
0                                            

**Remove punctuation**

In [None]:
def remove_punc1(text):
    """Return `text` without any character of string.punctuation, except '#'."""
    return ''.join(
        char for char in text
        if char == '#' or char not in string.punctuation
    )

# Strip punctuation from every entry of the 'text' column
df['text'] = df['text'].map(remove_punc1)
print(df['text'].head())

0                                                            if you made it through this  were not only able to start making money for yourself but sustain living that way all from home fuck these companies  corporate pigs power to the people always
1                                                                                                                                                                                             technically that is still turning back the clock dick head 
2            and you are the govt stop thinking about world media liberal gangs or any optics whatsoever and act now already  if this is what a person at your level is facing then shudder to think the plight of common people in bengal #bengalburning
3                                                                                                                                                                                                                      soldier of japan who has dick head


**Remove stopwords**

In [None]:
!pip install nltk



In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def custom_tokenize(text):
    """Tokenize `text` with word_tokenize, re-attaching each '#' to the token after it.

    A '#' that is the very last token has nothing to attach to and is kept
    as a token of its own.
    """
    merged_tokens = []
    remaining = iter(word_tokenize(text))
    for token in remaining:
        if token != '#':
            merged_tokens.append(token)
            continue
        # Consume the token that follows the '#', if there is one
        follower = next(remaining, None)
        merged_tokens.append(token if follower is None else '#' + follower)
    return merged_tokens



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return `text` without stop words.

    The text is split with custom_tokenize; tokens whose lowercase form is in
    `stop_words` are dropped and the rest are joined with single spaces.
    """
    kept_words = []
    for word in custom_tokenize(text):
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Apply the function to the 'text' column
df['text'] = df['text'].apply(remove_stop_words)
print(df['text'].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0                                                     made able start making money sustain living way home fuck companies corporate pigs power people always
1                                                                                                             technically still turning back clock dick head
2    govt stop thinking world media liberal gangs optics whatsoever act already person level facing shudder think plight common people bengal #bengalburning
3                                                                                                                                    soldier japan dick head
4                                                                                                              would better asking think sleazy shitbag lmao
Name: text, dtype: object


**Remove non-English characters**

In [None]:
import re
def remove_non_english(text):
    """Return `text` keeping only ASCII letters, whitespace and '#'."""
    unwanted_chars = re.compile(r'[^a-zA-Z\s#]')
    return unwanted_chars.sub('', text)

# Clean every entry of the DataFrame's 'text' column
df['text'] = df['text'].map(remove_non_english)
print(df['text'].head())

0                                                     made able start making money sustain living way home fuck companies corporate pigs power people always
1                                                                                                             technically still turning back clock dick head
2    govt stop thinking world media liberal gangs optics whatsoever act already person level facing shudder think plight common people bengal #bengalburning
3                                                                                                                                    soldier japan dick head
4                                                                                                              would better asking think sleazy shitbag lmao
Name: text, dtype: object


**Remove repeated characters in words (e.g., noooo)**

In [None]:
!pip install nltk




In [None]:
import os
import subprocess
import zipfile

import nltk

# Directory that receives the downloaded NLTK data. It is registered up front
# so a copy left there by an earlier run is found by the lookup below.
NLTK_DATA_DIR = '/kaggle/working/'
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Download and unzip wordnet only when no copy is available yet. The resource
# is named by its category path, and only the "not found" error is handled so
# unrelated failures are not hidden.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet', download_dir=NLTK_DATA_DIR)
    wordnet_archive = os.path.join(NLTK_DATA_DIR, 'corpora', 'wordnet.zip')
    if os.path.isfile(wordnet_archive):
        # zipfile overwrites existing files silently, so a re-run cannot
        # stall on an interactive overwrite prompt.
        with zipfile.ZipFile(wordnet_archive) as archive:
            archive.extractall(os.path.dirname(wordnet_archive))

# Now you can import the NLTK resources as usual
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /kaggle/working/...


In [None]:
class RepeatReplacer(object):
    """Shorten runs of repeated characters in a word until WordNet knows it.

    A word for which wordnet.synsets returns a non-empty result is returned
    unchanged. Otherwise one substitution pass of the repeat pattern is
    applied and the check starts over, until the word is known to WordNet or
    a pass no longer changes it.
    """

    def __init__(self):
        # Group 2 followed by a backreference to itself matches a doubled
        # character; the replacement keeps a single copy of it.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return `word` with repeated characters collapsed as described on the class."""
        candidate = word
        while not wordnet.synsets(candidate):
            shortened = self.repeat_regexp.sub(self.repl, candidate)
            if shortened == candidate:
                # Nothing left to collapse
                break
            candidate = shortened
        return candidate


In [None]:
# One RepeatReplacer instance shared by every call below
replacer = RepeatReplacer()

def apply_replacer(text):
    """Run replacer.replace on each whitespace-separated word of `text` and rejoin with single spaces."""
    return ' '.join(map(replacer.replace, text.split()))

# Collapse repeated characters in every entry of the 'text' column
df['text'] = df['text'].apply(apply_replacer)

# Show the first rows of the cleaned text
print(df['text'].head())

0                                                     made able start making money sustain living way home fuck companies corporate pigs power people always
1                                                                                                             technically still turning back clock dick head
2    govt stop thinking world media liberal gangs optics whatsoever act already person level facing shudder think plight common people bengal #bengalburning
3                                                                                                                                    soldier japan dick head
4                                                                                                              would better asking think sleazy shitbag lmao
Name: text, dtype: object


**Lemmatization**

In [None]:
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')



def perform_lemmatization(text):
    """Lemmatize each token of `text` and return the lemmas joined by single spaces.

    Tokens come from custom_tokenize; each one is passed to
    WordNetLemmatizer.lemmatize.
    """
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in custom_tokenize(text))

# Lemmatize every entry of the 'text' column
df['text'] = df['text'].apply(perform_lemmatization)

# Show the first rows of the lemmatized text
print(df['text'].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


0                                                       made able start making money sustain living way home fuck company corporate pig power people always
1                                                                                                            technically still turning back clock dick head
2    govt stop thinking world medium liberal gang optic whatsoever act already person level facing shudder think plight common people bengal #bengalburning
3                                                                                                                                   soldier japan dick head
4                                                                                                             would better asking think sleazy shitbag lmao
Name: text, dtype: object


In [None]:
# First rows whose encoded label is 0
label_is_zero = df['label'] == 0
print(df[label_is_zero].head())

                                                                                                  text  \
0  made able start making money sustain living way home fuck company corporate pig power people always   
1                                                       technically still turning back clock dick head   
3                                                                              soldier japan dick head   
4                                                        would better asking think sleazy shitbag lmao   
5                                                                                                 dick   

   label  
0      0  
1      0  
3      0  
4      0  
5      0  


**Tokenization**

In [None]:
!pip install transformers




In [None]:
import pandas as pd
from transformers import BertTokenizer

# Load the tokenizer of the pretrained 'bert-base-multilingual-cased' checkpoint (mBERT).
# NOTE(review): the text was lowercased earlier in this notebook while this
# checkpoint is a cased one; confirm that this pairing is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def tokenize_long_text(text, max_length=512, stride=100):
    """Encode `text` into token ids using sliding windows over its tokens.

    The text is tokenized once with the module-level `tokenizer`. The token
    ids are then cut into windows, each wrapped in the tokenizer's special
    tokens, and the windows are concatenated into one flat list.

    Args:
        text: The string to encode.
        max_length: Maximum length of one window, special tokens included.
        stride: Number of tokens the window start advances per step.

    Returns:
        A flat list of token ids. A text that fits into a single window is
        encoded exactly once; an empty text yields an empty list.

    Raises:
        ValueError: If `max_length` leaves no room for content tokens or
            `stride` is not positive.
    """
    if max_length <= 2:
        raise ValueError("max_length must be greater than 2 to leave room for special tokens")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Window over token ids, not characters, so max_length is a token budget.
    content_ids = tokenizer.encode(text, add_special_tokens=False)
    if not content_ids:
        return []

    window = max_length - 2  # two slots are reserved for the special tokens
    tokens = []
    start = 0
    while True:
        chunk_ids = content_ids[start:start + window]
        tokens.extend(tokenizer.build_inputs_with_special_tokens(chunk_ids))
        # Stop once a window has reached the end; sliding further would only
        # re-encode suffixes that are already covered.
        if start + window >= len(content_ids):
            break
        start += stride
    return tokens


# Tokenize the 'text' column with sliding window
df['tokenized_text'] = df['text'].apply(tokenize_long_text, max_length=512, stride=100)

# Show the first rows, including the new 'tokenized_text' column
print(df.head())

                                                                                                                                                     text  \
0                                                     made able start making money sustain living way home fuck company corporate pig power people always   
1                                                                                                          technically still turning back clock dick head   
2  govt stop thinking world medium liberal gang optic whatsoever act already person level facing shudder think plight common people bengal #bengalburning   
3                                                                                                                                 soldier japan dick head   
4                                                                                                           would better asking think sleazy shitbag lmao   

   label  \
0      0   
1      0   
2      1   
3      0 

**Sentence embedding**

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/68.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199773 sha256=863daa9e2f3cf8849927927e72fbd82f850f6065fcf0aa22d547e4303d074a91
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fa

In [None]:
import os
import shutil
import fasttext.util

# Specify the path where you want to save the model
model_path = '/content/cc.en.300.bin.gz.part'

# Download and move the model only when it is not already in place, so
# re-running this cell neither downloads it again nor fails on the move.
if not os.path.exists(model_path):
    # Download the pre-trained FastText model for English
    fasttext.util.download_model('en', if_exists='ignore')

    # Move the downloaded model to the specified path. shutil.move also works
    # when source and destination are on different filesystems.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    shutil.move('cc.en.300.bin', model_path)

# Load the pre-trained model
model = fasttext.load_model(model_path)


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz





In [None]:
def get_sentence_vector(text):
    """Return what the loaded FastText `model` gives for `text` via get_sentence_vector."""
    sentence_vector = model.get_sentence_vector(text)
    return sentence_vector

# Store the sentence vector of every 'text' entry in a new 'text_vector' column
sentence_vectors = df['text'].apply(get_sentence_vector)
df['text_vector'] = sentence_vectors


In [None]:
# Show the first rows, including the 'tokenized_text' and 'text_vector' columns added above
print(df.head())

                                                                                                                                                     text  \
0                                                     made able start making money sustain living way home fuck company corporate pig power people always   
1                                                                                                          technically still turning back clock dick head   
2  govt stop thinking world medium liberal gang optic whatsoever act already person level facing shudder think plight common people bengal #bengalburning   
3                                                                                                                                 soldier japan dick head   
4                                                                                                           would better asking think sleazy shitbag lmao   

   label  \
0      0   
1      0   
2      1   
3      0 

**Export to CSV**

In [None]:
# Write the frame to 'english_preprocessed.csv' without the index column.
# NOTE(review): the 'tokenized_text' and 'text_vector' columns hold one
# sequence per row and will presumably be written as their string
# representations; confirm the consumer of this CSV can parse them back.
df.to_csv('english_preprocessed.csv', index=False)
