<a href="https://colab.research.google.com/github/Shamvv/NLP-Project/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install gensim
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296184 sha256=61c70053a4d85c28c0bf3e04f8d214557353a995b3c19f12d4b82f84c6231e52
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

In [7]:
import ast
import string

import fasttext.util
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

# Load the dataset
df = pd.read_csv('/content/sham.csv')

# Column whose cells are expected to hold a token list serialized as Python-literal text
text_column = 'PDF3_Content_Tokens'

# Parse each serialized token list and join it into one space-separated string.
# ast.literal_eval only accepts Python literals, so a corrupted or malicious CSV cell
# cannot execute code the way eval() would. Non-string cells (e.g. NaN) pass through.
df[text_column] = df[text_column].apply(
    lambda cell: ' '.join(ast.literal_eval(cell)) if isinstance(cell, str) else cell
)

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### Task 1: Text Preprocessing (Lowercasing, Punctuation Removal, Stopwords, Lemmatization)
def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order: lowercase, delete every ``string.punctuation`` character,
    tokenize with ``word_tokenize``, drop tokens present in ``stop_words``, and
    lemmatize what remains with ``lemmatizer``.

    Returns the lemmas joined by single spaces, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Punctuation characters are deleted outright; no space is inserted in their place
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_stripper)

    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(normalized)
        if token not in stop_words
    ]
    return ' '.join(kept_lemmas)

# Run the Task 1 cleaning pipeline over every document
df['Preprocessed_Text'] = df[text_column].map(preprocess_text)

### Task 2: Tokenization (Word-level and Sentence-level)
# Word tokens of the cleaned text
df['Word_Tokens'] = df['Preprocessed_Text'].map(word_tokenize)

# Sentence tokens of the cleaned text
# NOTE(review): preprocess_text has already deleted all punctuation, so this most likely
# produces a single "sentence" per document - confirm whether raw text should be split instead.
df['Sentence_Tokens'] = df['Preprocessed_Text'].map(sent_tokenize)

### Task 3: Part-of-Speech (POS) Tagging
def pos_tagging(text):
    """Tokenize ``text`` with ``word_tokenize`` and tag the tokens with ``nltk.pos_tag``.

    Returns whatever ``nltk.pos_tag`` produces for the tokens, or ``[]`` when
    ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []
    return nltk.pos_tag(word_tokenize(text))

# Tag each cleaned document and store the result next to it
df['POS_Tags'] = df['Preprocessed_Text'].map(pos_tagging)

### Task 4: Named Entity Recognition (NER)
def ner_extraction(text):
    """Run the spaCy pipeline ``nlp`` over ``text`` and collect its named entities.

    Returns a list of ``(entity_text, entity_label)`` tuples in the order spaCy
    reports them, or ``[]`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    entities = []
    for ent in nlp(text).ents:
        entities.append((ent.text, ent.label_))
    return entities

# Extract entities from each cleaned document
df['NER_Tags'] = df['Preprocessed_Text'].map(ner_extraction)

### Task 5: Sentiment Analysis
def sentiment_analysis(text):
    """Return TextBlob's polarity score for ``text``.

    The score ranges from -1 (negative) to 1 (positive). Non-string input is
    treated as neutral and returns 0.
    """
    if not isinstance(text, str):
        return 0  # Neutral sentiment for invalid input
    polarity = TextBlob(text).sentiment.polarity
    return polarity

# Score the polarity of each cleaned document
df['Sentiment'] = df['Preprocessed_Text'].map(sentiment_analysis)

### Task 6: FastText Embedding
from gensim import downloader as api

# Fetch (or reuse the cached copy of) the pretrained vectors via the gensim downloader
ft_model = api.load("fasttext-wiki-news-subwords-300")

# Function to get FastText embeddings for each word and aggregate to represent the document
def get_fasttext_embedding(text, model=None):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Args:
        text: Document text; tokenized with ``word_tokenize``.
        model: Word-vector lookup supporting ``token in model``, ``model[token]``
            and ``model.vector_size``. Defaults to the notebook-level ``ft_model``.

    Returns:
        A 1-D numpy array of length ``model.vector_size``: the element-wise mean of
        the vectors of every token found in ``model``, or all zeros when ``text``
        is not a string or none of its tokens are in the vocabulary.
    """
    if model is None:
        model = ft_model

    # Size the fallback from the model so it always matches the real vectors,
    # instead of repeating the literal 300 from the model name.
    zero_vector = np.zeros(model.vector_size)

    # Same guard as the other per-document helpers: NaN/None rows must not reach the tokenizer
    if not isinstance(text, str):
        return zero_vector

    vectors = [model[token] for token in word_tokenize(text) if token in model]
    if not vectors:
        return zero_vector

    # Average the embeddings to get a fixed-size representation for the document
    return np.mean(vectors, axis=0)

# Compute one averaged pretrained vector per document
df['FastText_Embedding'] = df['Preprocessed_Text'].map(get_fasttext_embedding)

# Persist everything computed so far (the DataFrame index is not written)
# NOTE(review): list- and array-valued columns end up in the CSV as their string form;
# confirm that downstream readers can parse them back.
df.to_csv('/content/sham_processed_with_embeddings.csv', index=False)

# Preview the first rows, including the new embedding column
df.head()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




Unnamed: 0,PDF3_Content,PDF3_Content_Tokens,Preprocessed_Text,Word_Tokens,Sentence_Tokens,POS_Tags,NER_Tags,Sentiment,FastText_Embedding
0,BNS Refer ]Chapter Of robbery and dacoity309. ...,from old ipcalso refer chapter of robbery and ...,old ipcalso refer chapter robbery dacoity309 r...,"[old, ipcalso, refer, chapter, robbery, dacoit...",[old ipcalso refer chapter robbery dacoity309 ...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (chap...","[(z, DATE), (ten thousand, CARDINAL), (child4,...",0.025467,"[0.0042160857, 0.0020693878, 0.0141438, 0.0090..."
1,BNS Refer ]5. Commutation of sentence.-The app...,from old ipcalso refer 5 commutation of senten...,old ipcalso refer 5 commutation sentencethe ap...,"[old, ipcalso, refer, 5, commutation, sentence...",[old ipcalso refer 5 commutation sentencethe a...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (5, C...","[(5, CARDINAL), (section 474, LAW), (2023expla...",0.275,"[0.013676854, -0.0075990492, 0.017068015, 0.00..."
2,"BNS Refer ]19. Act likely to cause harm, but d...",from old ipcalso refer 19 act likely to cause ...,old ipcalso refer 19 act likely cause harm don...,"[old, ipcalso, refer, 19, act, likely, cause, ...",[old ipcalso refer 19 act likely cause harm do...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (19, ...","[(19, CARDINAL), (twenty thirty, QUANTITY), (t...",0.030952,"[-0.01996835, 0.002478866, 0.006519607, 0.0172..."
3,BNS Refer ]248. False charge of offence made w...,from old ipcalso refer 248 false charge of off...,old ipcalso refer 248 false charge offence mad...,"[old, ipcalso, refer, 248, false, charge, offe...",[old ipcalso refer 248 false charge offence ma...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (248,...","[(248, CARDINAL), (five year, DATE), (two, CAR...",-0.143939,"[-0.01277735, -0.003020652, 0.022983454, -0.00..."
4,BNS Refer ]355. Misconduct in public by a drun...,from old ipcalso refer 355 misconduct in publi...,old ipcalso refer 355 misconduct public drunke...,"[old, ipcalso, refer, 355, misconduct, public,...",[old ipcalso refer 355 misconduct public drunk...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (355,...","[(355, CARDINAL), (one thousand, CARDINAL)]",0.103333,"[-0.010172161, -0.017274583, 0.026015742, 0.00..."


In [8]:
import fasttext

# Step 1: Prepare the dataset for FastText
# Write the preprocessed text to a plain text file, one document per line.
# The encoding is fixed to UTF-8 (what fastText expects) rather than left to the platform default.
with open('/content/sham_preprocessed.txt', 'w', encoding='utf-8') as f:
    for text in df['Preprocessed_Text']:
        if isinstance(text, str):
            f.write(text + '\n')

# Step 2: Train a FastText model on your dataset
# No pretrained vectors are passed in, so this trains a new skip-gram model on the
# corpus file alone rather than continuing from the pretrained vectors loaded earlier.
# You can adjust these parameters (e.g., epoch, lr, etc.) to tune training.
model = fasttext.train_unsupervised('/content/sham_preprocessed.txt', model='skipgram',
                                    dim=300, epoch=10, lr=0.05, wordNgrams=2, minCount=1)

# Step 3: Save the trained FastText model
model.save_model('/content/fasttext_finetuned_model.bin')

# Step 4: Use the fine-tuned FastText model to get embeddings
def get_finetuned_fasttext_embedding(text):
    """Embed a document with the notebook-level fastText ``model``.

    The document is tokenized with ``word_tokenize``; the vectors of all tokens
    for which ``token in model`` holds are averaged element-wise.

    Args:
        text: Document text.

    Returns:
        A 1-D numpy array of length ``model.get_dimension()``: the mean word vector,
        or all zeros when ``text`` is not a string or no token is in the model.
    """
    # Ask the trained model for its size instead of repeating the dim used at training
    # time; the two would silently disagree if that argument were ever changed.
    zero_vector = np.zeros(model.get_dimension())

    # Same guard as the other per-document helpers: NaN/None rows must not reach the tokenizer
    if not isinstance(text, str):
        return zero_vector

    vectors = [model.get_word_vector(token) for token in word_tokenize(text) if token in model]
    if not vectors:
        return zero_vector

    # Average the embeddings to get a fixed-size representation for the document
    return np.mean(vectors, axis=0)

# Compute one averaged vector per document using the model trained above
df['Finetuned_FastText_Embedding'] = df['Preprocessed_Text'].map(get_finetuned_fasttext_embedding)

# Persist the frame again, now including the new embedding column (index not written)
df.to_csv('/content/sham_processed_with_finetuned_embeddings.csv', index=False)

# Preview the first rows, including the new embedding column
df.head()


Unnamed: 0,PDF3_Content,PDF3_Content_Tokens,Preprocessed_Text,Word_Tokens,Sentence_Tokens,POS_Tags,NER_Tags,Sentiment,FastText_Embedding,Finetuned_FastText_Embedding
0,BNS Refer ]Chapter Of robbery and dacoity309. ...,from old ipcalso refer chapter of robbery and ...,old ipcalso refer chapter robbery dacoity309 r...,"[old, ipcalso, refer, chapter, robbery, dacoit...",[old ipcalso refer chapter robbery dacoity309 ...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (chap...","[(z, DATE), (ten thousand, CARDINAL), (child4,...",0.025467,"[0.0042160857, 0.0020693878, 0.0141438, 0.0090...","[0.0060935947, 0.017565543, -0.10015271, -0.10..."
1,BNS Refer ]5. Commutation of sentence.-The app...,from old ipcalso refer 5 commutation of senten...,old ipcalso refer 5 commutation sentencethe ap...,"[old, ipcalso, refer, 5, commutation, sentence...",[old ipcalso refer 5 commutation sentencethe a...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (5, C...","[(5, CARDINAL), (section 474, LAW), (2023expla...",0.275,"[0.013676854, -0.0075990492, 0.017068015, 0.00...","[-0.008262893, 0.021229737, -0.08821842, -0.08..."
2,"BNS Refer ]19. Act likely to cause harm, but d...",from old ipcalso refer 19 act likely to cause ...,old ipcalso refer 19 act likely cause harm don...,"[old, ipcalso, refer, 19, act, likely, cause, ...",[old ipcalso refer 19 act likely cause harm do...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (19, ...","[(19, CARDINAL), (twenty thirty, QUANTITY), (t...",0.030952,"[-0.01996835, 0.002478866, 0.006519607, 0.0172...","[-0.008598564, 0.024581244, -0.089028925, -0.0..."
3,BNS Refer ]248. False charge of offence made w...,from old ipcalso refer 248 false charge of off...,old ipcalso refer 248 false charge offence mad...,"[old, ipcalso, refer, 248, false, charge, offe...",[old ipcalso refer 248 false charge offence ma...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (248,...","[(248, CARDINAL), (five year, DATE), (two, CAR...",-0.143939,"[-0.01277735, -0.003020652, 0.022983454, -0.00...","[0.022292279, 0.008449088, -0.10749799, -0.153..."
4,BNS Refer ]355. Misconduct in public by a drun...,from old ipcalso refer 355 misconduct in publi...,old ipcalso refer 355 misconduct public drunke...,"[old, ipcalso, refer, 355, misconduct, public,...",[old ipcalso refer 355 misconduct public drunk...,"[(old, JJ), (ipcalso, NN), (refer, VBP), (355,...","[(355, CARDINAL), (one thousand, CARDINAL)]",0.103333,"[-0.010172161, -0.017274583, 0.026015742, 0.00...","[0.011175511, 0.013544768, -0.09707727, -0.123..."


In [None]:
import numpy as np

# Prepare a function to extract document embeddings from FastText model
def get_finetuned_fasttext_embedding(text, model, dim=None):
    """Embed a document as the mean of its in-vocabulary fastText word vectors.

    Note: this name is also defined earlier in the notebook with a one-argument
    signature; once this cell runs, this two-argument version replaces it.

    Args:
        text: Preprocessed document; split on whitespace.
        model: Object supporting ``token in model`` and ``model.get_word_vector(token)``.
        dim: Length of the zero vector returned when nothing can be embedded.
            Defaults to ``model.get_dimension()`` when the model provides it,
            otherwise 300.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all tokens found
        in ``model``, or ``dim`` zeros when ``text`` is not a string or no token
        is in the model.
    """
    if dim is None:
        # Prefer the model's own size so the fallback always matches real vectors
        get_dimension = getattr(model, "get_dimension", None)
        dim = get_dimension() if callable(get_dimension) else 300

    # If the text is not valid, return a zero vector
    if not isinstance(text, str):
        return np.zeros(dim)

    # Text is assumed to be preprocessed already, so a plain whitespace split is enough
    vectors = [model.get_word_vector(token) for token in text.split() if token in model]
    if not vectors:
        return np.zeros(dim)

    # Average the word embeddings to get a document embedding
    return np.mean(vectors, axis=0)

# Extrinsic evaluation: Text Classification Task
def classify_with_embeddings(df, model, label_column='label', test_size=0.2, random_state=42):
    """Extrinsic evaluation: accuracy of a logistic regression trained on document embeddings.

    Each row of ``df['Preprocessed_Text']`` is embedded with
    ``get_finetuned_fasttext_embedding(text, model)``. The embeddings are split into
    train and test sets, a logistic-regression classifier is fitted on the training
    part, and its accuracy on the held-out part is returned.

    Args:
        df: DataFrame with a ``'Preprocessed_Text'`` column and a label column.
        model: Embedding model forwarded to ``get_finetuned_fasttext_embedding``.
        label_column: Name of the column holding the class labels.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        Accuracy of the classifier on the test split.

    Raises:
        KeyError: If ``label_column`` is not a column of ``df``.
    """
    # Fail fast, before the slow embedding pass, when there is nothing to predict
    if label_column not in df.columns:
        raise KeyError(
            f"Column {label_column!r} not found; classification needs one label per document. "
            f"Available columns: {list(df.columns)}"
        )

    # scikit-learn is not imported anywhere else in this notebook, so the names used
    # below are brought into scope here.
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # Get document embeddings, stacked into an (n_documents, embedding_dim) matrix
    doc_vectors = df['Preprocessed_Text'].apply(
        lambda text: get_finetuned_fasttext_embedding(text, model)
    )
    X = np.vstack(doc_vectors.tolist())
    y = df[label_column]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Train a classifier (logistic regression)
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)

    # Make predictions and compute accuracy
    y_pred = classifier.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Perform text classification to evaluate the embeddings
# `model` is the fastText model trained in the earlier cell; `df` is the processed frame.
# NOTE(review): the frame displayed by the earlier cells has no 'label' column, so this
# call presumably raises KeyError until labels are added - confirm where they come from.
classification_accuracy = classify_with_embeddings(df, model)

# Display the result
print(f"Text classification accuracy: {classification_accuracy}")
