# Installing Libraries and Imports

In [1]:
!pip install -U accelerate
!pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m256.0/309.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [116]:
import torch
from textblob import TextBlob
from colorama import Fore, Style
from tabulate import tabulate
from umap.umap_ import UMAP
from sklearn.manifold import TSNE, trustworthiness
from sklearn.decomposition import PCA , TruncatedSVD


# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In this notebook, you will be working with a Large Language Model (LLM) and explore its capabilities to help you solve various problems.

# Loading Model

We will be using Phi-3 as our LLM.

In [None]:
MODEL_ARGS = {
    'Name': 'microsoft/Phi-3-mini-128k-instruct',
    'DType': torch.bfloat16 # add torch.
}
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def load_model(model_args):


    model = AutoModelForCausalLM.from_pretrained(
        model_args['Name'],
        trust_remote_code=True,
        torch_dtype=model_args['DType'], #remove torch.
        low_cpu_mem_usage=True,
        device_map={"": device},
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args['Name'],
        trust_remote_code=True,
    )

    return model, tokenizer

In [5]:
model, tokenizer = load_model(MODEL_ARGS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# First Inference

In [None]:
def generate_text(model, tokenizer, prompt, max_new_tokens = 100, do_sample=True, temperature=0.5):

    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    if do_sample:
        output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature)
    else:
        output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=do_sample)

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return output_text[len(prompt):]

Lets break down this function:

**Arguments**:

* **model**: The language model used for text generation.
* **tokenizer**: The tokenizer that converts text to tokens and vice versa.
* **prompt**: The initial text input that the model will build upon.
* **max_new_tokens**: The maximum number of new tokens to generate.
* **do_sample**: Whether to sample the next token or use deterministic decoding.
* **temperature**: Controls the randomness of sampling; higher values produce more diverse outputs. ( model creativity )

**Functionality**:

The generate_text function creates more text based on a given starting prompt using a language model and tokenizer. It first converts the prompt into tokens (numbers the model understands), then generates additional tokens to continue the text. Depending on settings, it can generate text randomly or in a fixed way. Finally, it converts the tokens back into readable text and returns the part that extends beyond the original prompt.

## Without template

In [7]:
prompt = """Tell me a funny story about a cute cat"""

generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=200,
    temperature=1.2,
)



''

## With template

In [12]:
prompt = """Insturction: Tell me a funny story about a cute cat
Answer:"""

generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=200,
    temperature=1.2,
)

" Once upon a time, there was a cute and playful cat named Miss Fluffernisspuff. She lived with an old lady who always told people that Miss Fluffernisspuff was exceptionally intelligent. One day, the old lady decided to test if her claim was true.\n\nIn the midst of their afternoon chillaxation, the old lady left for a brief moment to fetch some tea. Seizing the opportunity, Miss Fluffernisspuff grabbed an old, wrinkled shirt from the laundry basket and, with unmatched elegance and skill, started to do her own version of the ‘lady's laundry dance routine,' clawing, stitching, ruffling, and wrinkling the shirt simultaneously. Of course, it looked a disaster.\n\nWhen the old lady returned home, she saw her beloved cat wearing the shirt like a cape"

As you can see, the output generated by these models depends on the prompt provided.

But that's just the beginning! Let's try different prompt layouts

( You can use the keyword "Prompt Engineering" for more information )

# In Context Learning ( ICL )

LLMs can learn from their prompts, as you can give it examples or guide it and teach it how to solve the problem.

## Learning from examples

### No example

In [9]:

prompt = """Question: John volunteers at a shelter once a week for 3 hours at a time. How many hours does he volunteer per year?
Answer:"""

generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=200,
    temperature=0.2,
)

' John volunteers 192 hours per year.'

The right answer is ( 12 * 4 ) * 3 = 144

### One Example

In [10]:

prompt = """Question: John volunteers at a shelter once a week for 7 hours at a time. How many hours does he volunteer per year?
Answer: John volunteers 336 hours per year.
Question: John volunteers at a shelter once a week for 3 hours at a time. How many hours does he volunteer per year?
Answer:"""

generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=10,
    do_sample=False,
    temperature=0.0,
)

' John volunteers 156 hours per year'

Examples are not always effective for mathematical problems, so let's try another method.

## Chain of Thoughts ( CoT )

In [11]:

prompt = """Question: John volunteers at a shelter once a week for 7 hours at a time. How many hours does he volunteer per year?
Answer: There are 12 months in one year and 4 weeks in each month. So in one year, there are 12 * 4 = 48 weeks. If Jhon volunteers at a shelter once a week for 7 hours,
John volunteers 48 * 7 = 336 hours per year.
Question: John volunteers at a shelter once a week for 3 hours at a time. How many hours does he volunteer per year?
Answer:"""

generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new_tokens=70,
    do_sample=False,
    temperature=0.0,
)

' There are 12 months in one year and 4 weeks in each month. So in one year, there are 12 * 4 = 48 weeks. If John volunteers at a shelter once a week for 3 hours,\nJohn volunteers 48 * 3 = 144 hours per year.'

In Chain of Thought (CoT), we guide the model with one or more examples and provide it with the steps to solve the problem.

In [2]:
import pandas as pd

In [128]:
df_train = pd.read_csv("CSVs/train.csv")
df_test = pd.read_csv("CSVs/test.csv")
df_unlabeled = pd.read_csv("CSVs/unlabeled.csv")

In [38]:
def generate_label(review_text, model, tokenizer, max_length=512, do_sample=True,temperature=0.5):
    prompt = f"""
    Question: i will give you a review and if it is positive you answer with Positive and if Negative you answer with Negative: This has to be one of my favourite movies of all time. The dialogue, with the constant use of puns is very tight, the cast are superb, and the plot is highly original. Don't take my word for it - watch this movie and enjoy it for yourself.
    Answer: Positive
    Question: i will give you a review and if it is positive you answer with Positive and if Negative you answer with Negative: 'needed an excuse to get out of the house while paint dried - left the movie after an hour to return and watch the paint dry. I don't recall ever walking out on a movie before, but I really tried to stay. The script was not up to the cast and just kept 'going and going' badly - come on! Uma Thurman doing this stuff? Fairly lame special effects. These were older characters and actors doing superficial horny 20-something lives - just sort of annoying and wrong feeling. This review is based only on the first hour - it might have gotten better. I just had to get home and see if the paint dried a darker shade than when it went on.'
    Answer: Negative
    Question: i will give you a review and if it is positive you answer with Positive and if Negative you answer with Negative: 'this film was the worst film i have ever viewed. it was like a 'homework assignment' for a film class. it totally misses the mark when it comes to the 'message' it is trying to relay. characters are over exaggerated, poor acting and as for a plot...well it is utterly ridiculous. the cover shot is what made me think it may be a decent film, the co-actor is handsome and that's about it. moral of this movie: never judge a movie by its cover! save your time, money and energy and make your own home movie and you will be far better off than i. it was painful to watch and quite frankly i am surprised that anyone would spend money to make and distribute it!'
    Answer: Negative
    Question: i will give you a review and if it is positive you answer with Positive and if Negative you answer with Negative: {review_text}
    Answer:"""

    generated_text_indiced = generate_text(
        model=model,
        tokenizer=tokenizer,
        prompt=prompt,
        max_new_tokens=50,
        do_sample=False,
        temperature=0.0,
    )
    # print(generated_text_indiced)

    if "Positive" in generated_text_indiced:
        return 1
    elif "Negative" in generated_text_indiced:
        return 0
    else:
        return None  # Unable to classify

In [40]:
df_unlabeled["text"][40]



In [129]:
df_unlabeled_withLLM = pd.read_csv("CSVs/output.csv")
# df_unlabeled['label'] = df_unlabeled['text'].apply(lambda x: generate_label(x, model, tokenizer))
df_unlabeled = df_unlabeled_withLLM.dropna(subset=['label'])

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score , pairwise_distances_argmin_min


def train_and_evaluate_classifiers(X_train, y_train, X_test, y_test):
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Support Vector Machine": SVC(probability=True, random_state=42)
    }

    scores_list = []
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
        scores_list.append([name, accuracy, precision, recall, f1, roc_auc])

    headers = ["Classifier", "Accuracy", "Precision", "Recall", "F1 Score", "AUC-ROC"]
    colored_headers = [Fore.GREEN + header + Fore.RESET for header in headers]
    def format_score(score):
        if isinstance(score, float):
            return Fore.YELLOW + "{:.6f}".format(score) + Fore.RESET
        else:
            return Fore.YELLOW + str(score) + Fore.RESET

    colored_scores_list = [[format_score(score) for score in row] for row in scores_list]
    print(Fore.CYAN + Style.BRIGHT + "Classifier Evaluation Scores" + Style.RESET_ALL)
    print(tabulate(colored_scores_list, headers=colored_headers, tablefmt="fancy_grid", numalign="center", stralign="center", missingval="-"))

In [131]:
def get_feature_columns_name(df):
    features_columns = []
    for i in range(len(df["embedding"][0])):
        name = "feature" + f"{i + 1}"
        features_columns.append(name)
    return features_columns
feature_columns = get_feature_columns_name(df_train)

In [132]:
df_test.drop("Unnamed: 0", axis=1, inplace=True)
df_train.drop("Unnamed: 0", axis=1,inplace=True)
df_unlabeled.drop("Unnamed: 0", axis=1,inplace=True)
df_unlabeled_withLLM.drop("Unnamed: 0", axis=1,inplace=True)

In [133]:

df_train['embedding'] = df_train['embedding'].apply(lambda x: [float(i) for i in x.strip("[]").split(",")])
df_test['embedding'] = df_test['embedding'].apply(lambda x: [float(i) for i in x.strip("[]").split(",")])
df_unlabeled['embedding'] = df_unlabeled['embedding'].apply(lambda x: [float(i) for i in x.strip("[]").split(",")])

In [134]:
df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
df_test[get_feature_columns_name(df_test)] = df_test["embedding"].apply(pd.Series)
df_unlabeled[get_feature_columns_name(df_unlabeled)] = df_unlabeled["embedding"].apply(pd.Series)

  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_name(df_train)] = df_train["embedding"].apply(pd.Series)
  df_train[get_feature_columns_n

In [135]:
def calculate_polarity(text):
    return TextBlob(text).sentiment.polarity

In [138]:
df_unlabeled_withLLM['embedding'] = df_unlabeled_withLLM['embedding'].apply(lambda x: [float(i) for i in x.strip("[]").split(",")])
df_unlabeled_withLLM[get_feature_columns_name(df_unlabeled_withLLM)] = df_unlabeled_withLLM["embedding"].apply(pd.Series)

AttributeError: 'list' object has no attribute 'strip'

In [139]:
df_train["polarity"] = df_train["text"].apply(calculate_polarity)
df_test["polarity"] = df_test["text"].apply(calculate_polarity)
df_unlabeled["polarity"] = df_unlabeled["text"].apply(calculate_polarity)
# df_unlabeled_withLLM["polarity"] = df_unlabeled_withLLM["text"].apply(calculate_polarity)

  df_train["polarity"] = df_train["text"].apply(calculate_polarity)
  df_test["polarity"] = df_test["text"].apply(calculate_polarity)
  df_unlabeled["polarity"] = df_unlabeled["text"].apply(calculate_polarity)


In [140]:
feature_columns = get_feature_columns_name(df_train)
expanded_feature_columns = feature_columns + ["polarity"]

In [141]:
firsthalf_df = pd.read_csv("CSVs/f.csv")
secondhalf_df = pd.read_csv("CSVs/s.csv")
full_df = pd.concat([firsthalf_df, secondhalf_df], ignore_index=True)

In [142]:
full_df.drop("Unnamed: 0", axis=1, inplace=True)
full_df.drop("Unnamed: 0.1", axis=1, inplace=True)

In [143]:
full_df.head()

Unnamed: 0,text,embedding,label
0,there is no relation at all between fortier an...,"[-0.097577557, -0.1536363065, 0.311417222, 0.0...",1
1,in the process of trying to establish the audi...,"[-0.0003366936, 0.0877778083, -0.0071643554, 0...",0
2,i give this movie 7 out of 10 because the vill...,"[-0.275570631, -0.3291363716, 0.079317905, 0.0...",1
3,this is the best sci-fi that i have seen in my...,"[0.1461943835, -0.2785910368, 0.4456491172, -0...",1
4,what an appalling piece of rubbish!!! who are ...,"[0.1696606129, 0.354041934, 0.4451519549, -0.0...",0


In [144]:
full_df['embedding'] = full_df['embedding'].apply(lambda x: [float(i) for i in x.strip("[]").split(",")])

In [145]:
full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)

  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding"].apply(pd.Series)
  full_df[get_feature_columns_name(full_df)] = full_df["embedding

In [146]:
full_df["polarity"] = full_df["text"].apply(calculate_polarity)

  full_df["polarity"] = full_df["text"].apply(calculate_polarity)


In [147]:
full_df.dropna()

Unnamed: 0,text,embedding,label,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature760,feature761,feature762,feature763,feature764,feature765,feature766,feature767,feature768,polarity
0,there is no relation at all between fortier an...,"[-0.097577557, -0.1536363065, 0.311417222, 0.0...",1,-0.097578,-0.153636,0.311417,0.030520,0.211818,-0.151885,0.006378,...,-0.182791,-0.053788,-0.231376,-0.308851,-0.024332,0.048270,-0.082222,0.578135,-0.032731,0.091667
1,in the process of trying to establish the audi...,"[-0.0003366936, 0.0877778083, -0.0071643554, 0...",0,-0.000337,0.087778,-0.007164,0.118093,0.312951,-0.079622,0.150585,...,0.226954,-0.120762,-0.199064,-0.079021,-0.209518,0.162193,-0.115619,-0.097856,0.017722,0.366667
2,i give this movie 7 out of 10 because the vill...,"[-0.275570631, -0.3291363716, 0.079317905, 0.0...",1,-0.275571,-0.329136,0.079318,0.071611,0.177593,-0.131866,-0.020589,...,-0.008902,0.123043,-0.154760,-0.110534,-0.012030,0.209303,0.026651,0.440879,-0.037475,0.025787
3,this is the best sci-fi that i have seen in my...,"[0.1461943835, -0.2785910368, 0.4456491172, -0...",1,0.146194,-0.278591,0.445649,-0.009135,0.208270,-0.144610,0.070797,...,-0.093624,0.050069,-0.243474,-0.148038,0.213451,0.133828,-0.065608,0.486251,-0.103015,0.556250
4,what an appalling piece of rubbish!!! who are ...,"[0.1696606129, 0.354041934, 0.4451519549, -0.0...",0,0.169661,0.354042,0.445152,-0.075730,0.028488,-0.000389,0.021385,...,-0.171072,-0.179506,-0.270028,-0.206235,-0.012388,0.047808,-0.122534,0.306808,0.199574,-0.248633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,unbelievable!<br /><br />this film gets a 7 ou...,"[-0.0955021083, 0.0211753864, 0.3570575416, -0...",0,-0.095502,0.021175,0.357058,-0.132645,0.191091,-0.118738,0.082526,...,-0.069684,0.018771,-0.347073,-0.250546,0.129283,0.204098,-0.208941,0.457088,0.008937,-0.250000
1010,sweet romantic drama/comedy about stewart and ...,"[0.017505046, -0.0501609854, 0.4082049727, -0....",1,0.017505,-0.050161,0.408205,-0.076074,0.485385,-0.150266,0.158359,...,-0.359928,-0.083250,-0.254106,-0.380683,0.103381,0.076058,-0.203719,0.190500,-0.017864,0.340000
1011,"personally, i disdain the jerry springer show,...","[-0.196471706, -0.0579777397, 0.1792553961, -0...",1,-0.196472,-0.057978,0.179255,-0.106598,0.059643,-0.176397,0.009714,...,0.028779,-0.088875,-0.108830,-0.082903,-0.035345,0.396249,0.212461,0.426900,0.063165,-0.083333
1012,this film looked promising but it was actually...,"[-0.0007334474, -0.1367768645, 0.1660933644, 0...",0,-0.000733,-0.136777,0.166093,0.122063,0.161736,0.061725,0.149064,...,-0.161892,-0.024791,-0.152552,-0.199232,-0.038998,0.109979,-0.068365,0.453943,-0.019272,-0.080246


In [148]:
df_train.dropna()

Unnamed: 0,text,label,embedding,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature760,feature761,feature762,feature763,feature764,feature765,feature766,feature767,feature768,polarity
0,fairly good romantic comedy in which i don't t...,1,"[-0.0167805497, -0.0395836979, 0.1233159453, -...",-0.016781,-0.039584,0.123316,-0.112681,0.147103,0.022566,0.053462,...,-0.195328,0.035722,-0.379844,-0.180571,-0.036396,0.114420,-0.036291,0.421711,0.060781,0.316942
1,"""dressed to kill"", is one of the best thriller...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0...",-0.125270,0.101477,0.171829,-0.150420,0.494087,-0.016729,-0.013425,...,-0.187593,0.028957,-0.121169,-0.393291,0.200559,0.443822,-0.081122,0.346749,0.065129,0.364773
2,i'm glad that users (as of this date) who like...,1,"[0.1312361956, 0.0294876788, 0.2328549027, -0....",0.131236,0.029488,0.232855,-0.002422,0.081981,-0.063562,0.132140,...,-0.165477,0.012854,-0.191873,-0.312853,0.013078,0.150811,-0.169557,0.233549,0.052898,0.039801
3,needed an excuse to get out of the house while...,0,"[0.1387384981, 0.0460377187, 0.3447172046, -0....",0.138738,0.046038,0.344717,-0.042892,0.225556,-0.144981,0.023403,...,-0.077552,-0.013346,-0.228877,-0.144858,-0.075651,-0.076263,0.022555,0.092200,-0.068175,-0.163413
4,john candy's performance in once upon a crime ...,1,"[0.1606466323, -0.1768193543, 0.3563380837, -0...",0.160647,-0.176819,0.356338,-0.119723,0.257010,-0.037752,0.053498,...,-0.103852,0.013501,-0.221421,-0.197348,0.124097,0.090596,0.123112,0.122473,0.097609,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,"this film, i thought, was the great journey th...",1,"[-0.0127362479, -0.1083025485, 0.151258871, 0....",-0.012736,-0.108303,0.151259,0.018022,0.281201,-0.304326,-0.005685,...,-0.103134,0.062240,-0.161306,-0.152781,0.220059,0.102940,-0.042109,0.171636,0.166082,0.268056
146,i have wanted to see this for the longest time...,1,"[-0.0872147828, -0.1653934121, 0.238217473, -0...",-0.087215,-0.165393,0.238217,-0.081272,0.315752,-0.097431,-0.027156,...,-0.146171,0.079091,-0.376557,-0.266569,0.103020,0.086039,-0.091507,0.265124,-0.024546,0.357143
147,the egyptian movies has a lot of filmes with h...,0,"[0.0326799825, 0.0344635732, 0.3442422152, -0....",0.032680,0.034464,0.344242,-0.275831,0.163914,-0.428903,0.221885,...,-0.002559,0.116916,-0.336276,-0.309363,-0.234953,-0.053021,-0.354898,0.348796,-0.058987,0.196963
148,"when anti-bush jokes get really easy to do, a ...",0,"[-0.039546337, -0.04593236, 0.119179666, 0.087...",-0.039546,-0.045932,0.119180,0.087794,0.086585,-0.269455,-0.032886,...,-0.009202,0.032049,-0.106289,0.044190,0.030491,0.212227,-0.070574,0.261716,0.209701,0.194218


In [149]:
df_combined = pd.concat([df_train, full_df], ignore_index=True)
df_combined.dropna()

Unnamed: 0,text,label,embedding,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature760,feature761,feature762,feature763,feature764,feature765,feature766,feature767,feature768,polarity
0,fairly good romantic comedy in which i don't t...,1,"[-0.0167805497, -0.0395836979, 0.1233159453, -...",-0.016781,-0.039584,0.123316,-0.112681,0.147103,0.022566,0.053462,...,-0.195328,0.035722,-0.379844,-0.180571,-0.036396,0.114420,-0.036291,0.421711,0.060781,0.316942
1,"""dressed to kill"", is one of the best thriller...",1,"[-0.1252697259, 0.1014768854, 0.1718291789, -0...",-0.125270,0.101477,0.171829,-0.150420,0.494087,-0.016729,-0.013425,...,-0.187593,0.028957,-0.121169,-0.393291,0.200559,0.443822,-0.081122,0.346749,0.065129,0.364773
2,i'm glad that users (as of this date) who like...,1,"[0.1312361956, 0.0294876788, 0.2328549027, -0....",0.131236,0.029488,0.232855,-0.002422,0.081981,-0.063562,0.132140,...,-0.165477,0.012854,-0.191873,-0.312853,0.013078,0.150811,-0.169557,0.233549,0.052898,0.039801
3,needed an excuse to get out of the house while...,0,"[0.1387384981, 0.0460377187, 0.3447172046, -0....",0.138738,0.046038,0.344717,-0.042892,0.225556,-0.144981,0.023403,...,-0.077552,-0.013346,-0.228877,-0.144858,-0.075651,-0.076263,0.022555,0.092200,-0.068175,-0.163413
4,john candy's performance in once upon a crime ...,1,"[0.1606466323, -0.1768193543, 0.3563380837, -0...",0.160647,-0.176819,0.356338,-0.119723,0.257010,-0.037752,0.053498,...,-0.103852,0.013501,-0.221421,-0.197348,0.124097,0.090596,0.123112,0.122473,0.097609,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,unbelievable!<br /><br />this film gets a 7 ou...,0,"[-0.0955021083, 0.0211753864, 0.3570575416, -0...",-0.095502,0.021175,0.357058,-0.132645,0.191091,-0.118738,0.082526,...,-0.069684,0.018771,-0.347073,-0.250546,0.129283,0.204098,-0.208941,0.457088,0.008937,-0.250000
1160,sweet romantic drama/comedy about stewart and ...,1,"[0.017505046, -0.0501609854, 0.4082049727, -0....",0.017505,-0.050161,0.408205,-0.076074,0.485385,-0.150266,0.158359,...,-0.359928,-0.083250,-0.254106,-0.380683,0.103381,0.076058,-0.203719,0.190500,-0.017864,0.340000
1161,"personally, i disdain the jerry springer show,...",1,"[-0.196471706, -0.0579777397, 0.1792553961, -0...",-0.196472,-0.057978,0.179255,-0.106598,0.059643,-0.176397,0.009714,...,0.028779,-0.088875,-0.108830,-0.082903,-0.035345,0.396249,0.212461,0.426900,0.063165,-0.083333
1162,this film looked promising but it was actually...,0,"[-0.0007334474, -0.1367768645, 0.1660933644, 0...",-0.000733,-0.136777,0.166093,0.122063,0.161736,0.061725,0.149064,...,-0.161892,-0.024791,-0.152552,-0.199232,-0.038998,0.109979,-0.068365,0.453943,-0.019272,-0.080246


In [150]:
df_combined.isnull().sum()

text          0
label         0
embedding     0
feature1      0
feature2      0
             ..
feature765    0
feature766    0
feature767    0
feature768    0
polarity      0
Length: 772, dtype: int64

In [151]:
train_and_evaluate_classifiers(df_combined[expanded_feature_columns], df_combined["label"], df_test[expanded_feature_columns], df_test["label"])

[36m[1mClassifier Evaluation Scores[0m
╒════════════════════════╤════════════╤═════════════╤══════════╤════════════╤═══════════╕
│       [32mClassifier[39m       │  [32mAccuracy[39m  │  [32mPrecision[39m  │  [32mRecall[39m  │  [32mF1 Score[39m  │  [32mAUC-ROC[39m  │
╞════════════════════════╪════════════╪═════════════╪══════════╪════════════╪═══════════╡
│  [33mLogistic Regression[39m   │  [33m0.866667[39m  │  [33m0.833333[39m   │ [33m0.902778[39m │  [33m0.866667[39m  │ [33m0.938034[39m  │
├────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────┤
│     [33mDecision Tree[39m      │    [33m0.68[39m    │  [33m0.657895[39m   │ [33m0.694444[39m │  [33m0.675676[39m  │ [33m0.680556[39m  │
├────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────┤
│     [33mRandom Forest[39m      │  [33m0.813333[39m  │    [33m0.775[39m    │ [33m0.861111[39m │  [33m0.815789[39m  │ [33m0.911414[39m  

In [152]:
def reduce_embeddings(df, feature_columns, method="tsne", n_components=2):
    df_copy = df.copy()

    if method == "tsne":
        reducer = TSNE(n_components=n_components, random_state=42)
    elif method == "pca":
        reducer = PCA(n_components=n_components, random_state=42)
    elif method == "umap":
        reducer = UMAP(n_components=n_components, random_state=42)
    elif method == "truncated_svd":
        reducer = TruncatedSVD(n_components=n_components, random_state=42)
    else:
        raise ValueError("Invalid method specified.")

    embeddings = reducer.fit_transform(df[feature_columns])

    for i in range(n_components):
        df_copy[f"{method.upper()}_{i+1}"] = embeddings[:, i]

    columns_to_drop = feature_columns
    df_copy.drop(columns=columns_to_drop, axis=1, inplace=True)

    return df_copy, reducer

def transform_embeddings(df, reducer, method="tsne", n_components=2):
    df_copy = df.copy()

    if method in ["pca", "umap", "truncated_svd"]:
        embeddings = reducer.transform(df[feature_columns])
    elif method == "tsne":
        reducer = TSNE(n_components=n_components, random_state=42)
        embeddings = reducer.fit_transform(df[feature_columns])
    else:
        raise ValueError("Invalid method specified.")

    for i in range(n_components):
        df_copy[f"{method.upper()}_{i+1}"] = embeddings[:, i]

    columns_to_drop = feature_columns
    df_copy.drop(columns=columns_to_drop, axis=1, inplace=True)

    return df_copy


In [158]:
reduced_df, reducer = reduce_embeddings(df_combined, feature_columns, method="umap", n_components=8)
reduced_test_df = transform_embeddings(df_test, reducer, method="umap", n_components=8)

train_and_evaluate_classifiers(reduced_df.drop(['label', 'text', 'embedding'],axis=1), reduced_df["label"], reduced_test_df.drop(['label', 'text', 'embedding'],axis=1), reduced_test_df["label"])

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


[36m[1mClassifier Evaluation Scores[0m
╒════════════════════════╤════════════╤═════════════╤══════════╤════════════╤═══════════╕
│       [32mClassifier[39m       │  [32mAccuracy[39m  │  [32mPrecision[39m  │  [32mRecall[39m  │  [32mF1 Score[39m  │  [32mAUC-ROC[39m  │
╞════════════════════════╪════════════╪═════════════╪══════════╪════════════╪═══════════╡
│  [33mLogistic Regression[39m   │  [33m0.773333[39m  │  [33m0.756757[39m   │ [33m0.777778[39m │  [33m0.767123[39m  │ [33m0.869124[39m  │
├────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────┤
│     [33mDecision Tree[39m      │  [33m0.686667[39m  │  [33m0.662338[39m   │ [33m0.708333[39m │  [33m0.684564[39m  │  [33m0.6875[39m   │
├────────────────────────┼────────────┼─────────────┼──────────┼────────────┼───────────┤
│     [33mRandom Forest[39m      │  [33m0.773333[39m  │  [33m0.763889[39m   │ [33m0.763889[39m │  [33m0.763889[39m  │  [33m0.83084[39m  