In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive first: both TSV files live under /content/drive, so on a
# fresh Colab kernel the reads below fail unless the drive is mounted already.
# Calling drive.mount again when the drive is already mounted is harmless.
from google.colab import drive
drive.mount('/content/drive')

# Tab-separated inputs: the sentence table and the per-sentence label table.
sentences_df = pd.read_csv('/content/drive/MyDrive/training-english/sentences.tsv', sep='\t')
labels_df = pd.read_csv('/content/drive/MyDrive/training-english/labels.tsv', sep='\t')

In [None]:
# Preview the first rows of the sentences table (Text-ID, Sentence-ID, Text).
sentences_df.head()

Unnamed: 0,Text-ID,Sentence-ID,Text
0,EN_001,1,Hispanic Voters Are Losing Faith In The Democr...
1,EN_001,2,The support of Hispanic voters at the midterms...
2,EN_001,3,U.S. President Joe Biden speaks to employees a...
3,EN_001,4,(Julie Bennett/Getty Images) According to a Qu...
4,EN_001,5,This marks the lowest approval rating of any d...


In [None]:
# Column dtypes and non-null counts, to check for missing text before processing.
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44758 entries, 0 to 44757
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text-ID      44758 non-null  object
 1   Sentence-ID  44758 non-null  int64 
 2   Text         44758 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [None]:
# (rows, columns) of the sentences table.
sentences_df.shape

(44758, 3)

In [None]:
# (rows, columns) of the labels table; the row count should equal that of sentences_df.
labels_df.shape

(44758, 41)

In [None]:
# Mount Google Drive so files under /content/drive/MyDrive can be read.
# NOTE(review): cells above this one already read from /content/drive, so on a
# fresh kernel the drive has to be mounted before they run.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer model, stopword list, WordNet.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers for preprocess_text: the English stopword set and a lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# sentences_df and labels_df are expected to have been loaded by an earlier cell.

# Join each sentence with its label row on the composite key (inner join by default).
join_keys = ['Text-ID', 'Sentence-ID']
merged_df = sentences_df.merge(labels_df, on=join_keys)

# Define the text preprocessing function
def preprocess_text(text):
    """Normalize one sentence into a space-joined string of lemmatized tokens.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, tokenize, drop English stopwords, lemmatize.

    Relies on the module-level ``word_tokenize``, ``stop_words`` and
    ``lemmatizer`` being defined before the call.

    Args:
        text: the raw sentence. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as empty text.

    Returns:
        The cleaned sentence as a single string; ``''`` for non-string input.
    """
    # Missing or non-string cells have no text to clean; avoid crashing on .lower().
    if not isinstance(text, str):
        return ''

    # Lowercase the text
    text = text.lower()

    # Replace punctuation and numbers with a space instead of deleting them, so
    # words joined only by punctuation ("Bennett/Getty") remain separate tokens
    # rather than being fused into one ("bennettgetty").
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a single string
    return ' '.join(tokens)

# Clean every sentence and keep the result next to the raw text.
cleaned_series = merged_df['Text'].apply(preprocess_text)
merged_df['cleaned_text'] = cleaned_series

# Show raw vs. cleaned text for the first few rows as a sanity check.
preview_columns = ['Text', 'cleaned_text']
print(merged_df[preview_columns].head())

# Write the merged, preprocessed table to the working directory (no index column).
merged_df.to_csv('preprocessed_data.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                Text  \
0  Hispanic Voters Are Losing Faith In The Democr...   
1  The support of Hispanic voters at the midterms...   
2  U.S. President Joe Biden speaks to employees a...   
3  (Julie Bennett/Getty Images) According to a Qu...   
4  This marks the lowest approval rating of any d...   

                                        cleaned_text  
0  hispanic voter losing faith democratic party poll  
1  support hispanic voter midterm later year coul...  
2  u president joe biden speaks employee lockheed...  
3  julie bennettgetty image according quinnipiac ...  
4      mark lowest approval rating demographic group  


In [None]:
# Install transformers and pandas if not already installed
!pip install transformers pandas

# Import libraries
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import numpy as np  # Import NumPy

# Function to get BERT embeddings for a batch of sentences
def get_bert_embeddings(sentences, batch_size=8):
    """Embed sentences with ``bert-base-uncased`` using masked mean pooling.

    Each sentence is truncated to 128 tokens, run through the model without
    gradients, and reduced to a single vector by averaging the hidden states
    of its real (non-padding) tokens.

    Relies on the module-level ``device`` being defined before the call.

    Args:
        sentences: list of strings to embed.
        batch_size: number of sentences tokenized and encoded per step.

    Returns:
        A numpy array with one row per sentence and ``hidden_size`` columns;
        an empty ``(0, hidden_size)`` array when ``sentences`` is empty.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').to(device)
    model.eval()  # inference only: keep dropout disabled

    # np.vstack raises on an empty list, so handle "nothing to embed" explicitly.
    if len(sentences) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []

    # Process sentences in smaller batches
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state

        # padding=True pads every sentence to the longest one in the batch. A
        # plain mean over dim=1 would average those pad positions in as well,
        # making a sentence's vector depend on its batch neighbours. Weight by
        # the attention mask so only real tokens contribute.
        mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        batch_embeddings = ((last_hidden_states * mask).sum(dim=1) / token_counts).cpu().numpy()
        embeddings.append(batch_embeddings)

        # Clear unused GPU memory
        torch.cuda.empty_cache()

    return np.vstack(embeddings)  # Combine all batch embeddings into a single array

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Embed the raw sentence text from the 'Text' column of sentences_df.
sentences = sentences_df['Text'].tolist()

# Get BERT embeddings with reduced batch size
bert_embeddings = get_bert_embeddings(sentences, batch_size=8)  # Adjust batch size as needed
print("BERT Embeddings shape:", bert_embeddings.shape)

# Monitor GPU memory usage. torch.cuda.memory_summary only accepts a CUDA
# device, so skip it when the run fell back to CPU instead of raising.
if device.type == 'cuda':
    print(torch.cuda.memory_summary(device, abbreviated=False))




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings shape: (44758, 768)
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   8320 KiB | 491931 KiB |   2134 GiB |   2134 GiB |
|       from large pool |   8320 KiB | 490880 KiB |   1530 GiB |   1530 GiB |
|       from small pool |      0 KiB |  10156 KiB |    603 GiB |    603 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   8320 KiB | 491931 KiB |   2134 GiB |   2134 GiB |
|       from large pool |   8320 KiB | 490880 KiB |   1530 GiB |   1530 GiB |
|       from small pool |      0 KiB |  10156 KiB |    603 GiB |    603 GiB |
|---------------------------

In [None]:
# Persist the embedding matrix to bert_embeddings.npy in the current working directory.
# NOTE(review): a later cell loads '/content/drive/MyDrive/bert_embeddings.npy';
# confirm the file is copied there, or align the two paths.
np.save('bert_embeddings.npy', bert_embeddings)


In [None]:
# Reload the saved array and print its shape to confirm the save/load round trip.
loaded_embeddings = np.load('bert_embeddings.npy')
print("Loaded embeddings shape:", loaded_embeddings.shape)


Loaded embeddings shape: (44758, 768)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
import joblib

# Imported here because this cell turns raw sentence text into numeric features.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

sentences_df = pd.read_csv('/content/drive/MyDrive/training-english/sentences.tsv',sep='\t')

# Load labels from labels.tsv and select the score columns only
labels_file_path = '/content/drive/MyDrive/training-english/labels.tsv'
labels_df = pd.read_csv(labels_file_path, sep='\t')

# Remove non-label columns ('Text-ID' and 'Sentence-ID')
label_columns = labels_df.columns[2:]  # Exclude 'Text-ID' and 'Sentence-ID'

# Assign each sentence to the category with the highest score.
# NOTE(review): idxmax returns the first column on ties, so a row whose scores
# are all equal (for example all zero) lands in the first label column —
# confirm that this is the intended handling of unlabeled sentences.
labels_df['dominant_label'] = labels_df[label_columns].idxmax(axis=1)

# Convert categorical labels to numeric encoding for model training
labels_df['dominant_label'] = labels_df['dominant_label'].astype('category').cat.codes
labels = labels_df['dominant_label'].values  # This is our single-label target array

# Create DataFrame for features and labels. Work on a copy so that adding the
# 'label' column cannot leak back into sentences_df.
data = sentences_df.copy()
data['label'] = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['label']),
    data['label'],
    test_size=0.2,
    random_state=42
)

# LogisticRegression needs numeric features, but the frame holds string columns
# ('Text-ID', 'Text'). Vectorize the sentence text with TF-IDF inside a pipeline
# so the vocabulary is learned from the training split only.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000),
)

# Train the model on the sentence text
model.fit(X_train['Text'], y_train)

# Make predictions on the test set
y_pred = model.predict(X_test['Text'])

# Evaluate the model's performance. zero_division=0 keeps the scores for
# never-predicted classes at 0 without emitting a warning per metric.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted F1 score
print("Accuracy:", accuracy)
print("F1 Score (weighted):", f1)
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.48815907059874886
F1 Score (weighted): 0.3608412568898894
              precision    recall  f1-score   support

           0       0.29      0.04      0.07       402
           1       0.21      0.04      0.07       140
           2       0.21      0.04      0.07       143
           3       0.00      0.00      0.00        18
           4       0.10      0.02      0.03       105
           5       1.00      0.03      0.06        32
           6       0.00      0.00      0.00        45
           7       0.00      0.00      0.00        44
           8       0.29      0.11      0.15       285
           9       0.14      0.01      0.02       166
          10       0.00      0.00      0.00        78
          11       0.10      0.02      0.04        85
          12       0.00      0.00      0.00        46
          13       0.00      0.00      0.00        19
          14       0.00      0.00      0.00        14
          15       0.00      0.00      0.00         6
          1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
import joblib

# Imported here because this cell turns raw sentence text into numeric features.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

sentences_df = pd.read_csv('/content/drive/MyDrive/training-english/sentences.tsv',sep='\t')

# Load labels from labels.tsv and select the score columns only
labels_file_path = '/content/drive/MyDrive/training-english/labels.tsv'
labels_df = pd.read_csv(labels_file_path, sep='\t')

# Remove non-label columns ('Text-ID' and 'Sentence-ID')
label_columns = labels_df.columns[2:]  # Exclude 'Text-ID' and 'Sentence-ID'

# Assign each sentence to the category with the highest score.
# NOTE(review): idxmax returns the first column on ties, so a row whose scores
# are all equal (for example all zero) lands in the first label column —
# confirm that this is the intended handling of unlabeled sentences.
labels_df['dominant_label'] = labels_df[label_columns].idxmax(axis=1)

# Convert categorical labels to numeric encoding for model training
labels_df['dominant_label'] = labels_df['dominant_label'].astype('category').cat.codes
labels = labels_df['dominant_label'].values  # This is our single-label target array

# Create DataFrame for features and labels. Work on a copy so that adding the
# 'label' column cannot leak back into sentences_df.
data = sentences_df.copy()
data['label'] = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['label']),
    data['label'],
    test_size=0.2,
    random_state=42
)

# LogisticRegression needs numeric features, but the frame holds string columns
# ('Text-ID', 'Text'). Vectorize the sentence text with TF-IDF inside a pipeline
# so the vocabulary is learned from the training split only.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000),
)

# Train the model on the sentence text
model.fit(X_train['Text'], y_train)

# Make predictions on the test set
y_pred = model.predict(X_test['Text'])

# Evaluate the model's performance. zero_division=0 keeps the scores for
# never-predicted classes at 0 without emitting a warning per metric.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)  # Weighted F1 score
print("Accuracy:", accuracy)
print("F1 Score:", f1)

# Get the classification report as a dictionary
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

# Extract the support values from the report. Per-class entries are keyed by
# the class id as a string; the summary rows ('accuracy', 'macro avg', ...)
# are not digits and are skipped.
support = {label: report[label]['support'] for label in report if label.isdigit()}

# Print the support values
# print("\nSupport values for each class:")
# for label, value in support.items():
#     print(f"Class {label}: {value}")



Accuracy: 0.4884941912421805
F1 Score (weighted): 0.36096117284341306

Support values for each class:
Class 0: 402.0
Class 1: 140.0
Class 2: 143.0
Class 3: 18.0
Class 4: 105.0
Class 5: 32.0
Class 6: 45.0
Class 7: 44.0
Class 8: 285.0
Class 9: 166.0
Class 10: 78.0
Class 11: 85.0
Class 12: 46.0
Class 13: 19.0
Class 14: 14.0
Class 15: 6.0
Class 16: 324.0
Class 17: 62.0
Class 18: 286.0
Class 19: 137.0
Class 20: 60.0
Class 21: 97.0
Class 22: 313.0
Class 23: 412.0
Class 24: 282.0
Class 25: 48.0
Class 26: 4392.0
Class 27: 23.0
Class 28: 199.0
Class 29: 21.0
Class 30: 86.0
Class 31: 15.0
Class 32: 247.0
Class 33: 119.0
Class 34: 106.0
Class 35: 27.0
Class 36: 34.0
Class 37: 34.0
Model saved as 'emotion_detection_model.joblib'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression

# Load sentence and label data
sentences_df = pd.read_csv('/content/drive/MyDrive/training-english/sentences.tsv', sep='\t')
labels_df = pd.read_csv('/content/drive/MyDrive/training-english/labels.tsv', sep='\t')

# Load precomputed BERT embeddings (one row per sentence, in sentences_df order)
bert_embeddings = np.load('/content/drive/MyDrive/bert_embeddings.npy')  # Change the path accordingly

# Preprocessing: Merging data and selecting the dominant label
label_columns = labels_df.columns[2:]  # Exclude 'Text-ID' and 'Sentence-ID'
# NOTE(review): idxmax returns the first column on ties, so a row whose scores
# are all equal (for example all zero) lands in the first label column —
# confirm that this is the intended handling of unlabeled sentences.
labels_df['dominant_label'] = labels_df[label_columns].idxmax(axis=1)
# Convert dominant labels to numerical categories
labels_df['dominant_label'] = labels_df['dominant_label'].astype('category').cat.codes

# Merge text and labels data
merged_df = sentences_df.merge(labels_df[['Text-ID', 'Sentence-ID', 'dominant_label']], on=['Text-ID', 'Sentence-ID'])
labels = merged_df['dominant_label'].values

# The embeddings are paired with the labels purely by row position, so a merge
# that dropped or duplicated rows would silently misalign them.
if len(bert_embeddings) != len(labels):
    raise ValueError(
        f"Embedding rows ({len(bert_embeddings)}) do not match label rows ({len(labels)})"
    )

# Split data into training and validation sets
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(bert_embeddings, labels, test_size=0.2, random_state=42)

# Convert to torch tensors
train_embeddings = torch.tensor(train_embeddings)
val_embeddings = torch.tensor(val_embeddings)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Define the classifier (e.g., a logistic regression model)
classifier = LogisticRegression(max_iter=1000)

# Fit the classifier on the training data
classifier.fit(train_embeddings.numpy(), train_labels.numpy())

# Evaluate the classifier on the validation data
val_preds = classifier.predict(val_embeddings.numpy())

# Accuracy score
accuracy = accuracy_score(val_labels.numpy(), val_preds)

# Pass the full list of class ids explicitly: target_names must line up with
# `labels`, and a class may be missing from the validation split.
class_ids = np.unique(labels)
class_names = [str(i) for i in range(len(class_ids))]
report_kwargs = dict(labels=class_ids, target_names=class_names, zero_division=0)

# Classification report for detailed performance metrics (precision, recall, f1-score, support)
report = classification_report(val_labels.numpy(), val_preds, **report_kwargs)

# Read the supports from the dictionary form of the report instead of slicing
# the printed text: the text contains a blank separator line and an 'accuracy'
# row between the class rows and the averages, which a fixed slice trips over.
report_dict = classification_report(val_labels.numpy(), val_preds, output_dict=True, **report_kwargs)
class_supports = {name: int(report_dict[name]['support']) for name in class_names}

print(f"Accuracy: {accuracy}")
print("Support values for each class:")
for class_name, support in class_supports.items():
    print(f"Class {class_name}: {support}")


KeyboardInterrupt: 