In [None]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.1.2


**MULTICLASS SENTIMENT CLASSIFICATION WITH RANDOM FOREST**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# ---------------------------------------------------------------------------
# Train a Random Forest on TF-IDF features and predict a label combination for
# every test comment.
#
# The six binary labels are fused into one string per row (e.g. "1-0-1-0-1-0"),
# which turns the multi-label problem into a single multiclass problem. As a
# consequence the classifier can only ever predict label combinations that
# occur in the training data.
# ---------------------------------------------------------------------------

# Locations of the train and test datasets
train_file = '/content/train.csv'
test_file = '/content/test.csv'

# Read the datasets
df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# Normalise column names: trim surrounding whitespace and drop stray double
# quotes (literal replacement, no regex needed).
df_train.columns = df_train.columns.str.strip().str.replace('"', '', regex=False)
df_test.columns = df_test.columns.str.strip().str.replace('"', '', regex=False)

# Label columns that together form the prediction target
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Validate every column this cell relies on *before* touching the data, so a
# malformed file fails here with a clear message instead of midway through.
for col in target_columns:
    if col not in df_train.columns:
        raise KeyError(f"Column {col} not found in train dataset")
if 'comment_text' not in df_train.columns:
    raise KeyError("Column comment_text not found in train dataset")
for col in ('id', 'comment_text'):
    if col not in df_test.columns:
        raise KeyError(f"Column {col} not found in test dataset")

# Missing labels would otherwise be stringified as "nan" and silently become
# bogus classes; refuse to train on them.
if df_train[target_columns].isna().any().any():
    raise ValueError("Target columns contain missing values; cannot build combined labels")

# Replace NaN comments with an empty string and make sure everything is a str
X_train = df_train['comment_text'].fillna('').astype(str)
X_test = df_test['comment_text'].fillna('').astype(str)

# Convert text data to TF-IDF features; the vocabulary is learned from the
# training comments only and then reused for the test comments.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Combine the target columns into a single column for multiclass
# classification. Casting to int first guarantees tokens such as "1" rather
# than "1.0", so the combined label can later be split back into integers.
df_train['combined_target'] = (
    df_train[target_columns].astype(int).astype(str).agg('-'.join, axis=1)
)

y_train = df_train['combined_target']

# Initialize and train a Random Forest classifier. random_state fixes the seed
# used to build the trees; n_jobs=-1 builds them on all available cores.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train_tfidf, y_train)

# Predict the combined label for every test comment
y_pred = clf.predict(X_test_tfidf)

# Pair each test id with its predicted label combination
df_predictions = pd.DataFrame({'id': df_test['id'], 'predicted': y_pred})

# Save the predictions to a CSV file (relative to the working directory)
df_predictions.to_csv('random_forest_multiclass_predictions.csv', index=False)

print("Predictions saved to random_forest_multiclass_predictions.csv")


Predictions saved to random_forest_multiclass_predictions.csv


**Formatting the Results**

In [None]:
import pandas as pd

# Load the predictions. The training cell writes this file relative to the
# working directory, so read it from the same relative path instead of
# assuming the working directory is /content.
# Both columns are read as strings: ids keep any leading zeros, and the
# combined label stays in its "a-b-c-d-e-f" form.
predictions_file = 'random_forest_multiclass_predictions.csv'
df_predictions = pd.read_csv(predictions_file, dtype={'id': str, 'predicted': str})

# Display the first few rows of the predictions to understand the output
print(df_predictions.head())

# Split the combined predictions back into individual labels
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
split_labels = df_predictions['predicted'].str.split('-', expand=True)

# Every prediction must yield exactly one token per label; fail with a clear
# message if the file is empty, has missing predictions, or is malformed.
if split_labels.shape[1] != len(target_columns) or split_labels.isna().any().any():
    raise ValueError(
        f"Expected every prediction to contain {len(target_columns)} '-'-separated labels"
    )

# Convert the split columns back to integer type in one vectorised cast
df_predictions[target_columns] = split_labels.astype(int)

# Display the first few rows of the predictions with individual labels
print(df_predictions.head())

# Number of test comments predicted positive for each label
print(df_predictions[target_columns].sum())

# Save the detailed predictions to a new CSV file
df_predictions.to_csv('detailed_multiclass_predictions.csv', index=False)

print("Detailed predictions saved to detailed_multiclass_predictions.csv")


                 id    predicted
0  00001cee341fdb12  1-0-1-0-1-0
1  0000247867823ef7  0-0-0-0-0-0
2  00013b17ad220c46  0-0-0-0-0-0
3  00017563c3f7919a  0-0-0-0-0-0
4  00017695ad8997eb  0-0-0-0-0-0
                 id    predicted  toxic  severe_toxic  obscene  threat  \
0  00001cee341fdb12  1-0-1-0-1-0      1             0        1       0   
1  0000247867823ef7  0-0-0-0-0-0      0             0        0       0   
2  00013b17ad220c46  0-0-0-0-0-0      0             0        0       0   
3  00017563c3f7919a  0-0-0-0-0-0      0             0        0       0   
4  00017695ad8997eb  0-0-0-0-0-0      0             0        0       0   

   insult  identity_hate  
0       1              0  
1       0              0  
2       0              0  
3       0              0  
4       0              0  
toxic            16601
severe_toxic       691
obscene          13563
threat              97
insult           11442
identity_hate      387
dtype: int64
Detailed predictions saved to detailed_multi