In [1]:
#importing libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import string
import re

In [2]:
# Load the labeled domain tweet data
data_domain = pd.read_csv('C:/Users/dell/OneDrive/Desktop/dataset/Book2.csv')

In [3]:
# Load the labeled sentiment tweet data
data_sentiment = pd.read_csv('C:/Users/dell/OneDrive/Desktop/dataset/twitter_training.csv')

In [4]:
# Data preprocessing for domain classification
def preprocess_domain_data(data):
    # Remove any rows with missing data
    data.dropna(inplace=True)

    # Encode the domain labels
    le_domain = LabelEncoder()
    data['domain_label'] = le_domain.fit_transform(data['Category'])

    # Additional preprocessing steps
    data['Tweets'] = data['Tweets'].apply(lambda x: x.lower())  # Lowercasing
    data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'http\S+', '', x))  # Remove URLs
    data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'@[\w_]+', '', x))  # Remove user mentions
    data['Tweets'] = data['Tweets'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))  # Removing Punctuation

    return data

In [5]:
# Data preprocessing for sentiment classification
def preprocess_sentiment_data(data):
    # Remove any rows with missing data
    data.dropna(inplace=True)

    # Encode the sentiment labels
    le_sentiment = LabelEncoder()
    data['sentiment_label'] = le_sentiment.fit_transform(data['Sentiment'])

    # Additional preprocessing steps
    data['Tweets'] = data['Tweets'].apply(lambda x: x.lower())  # Lowercasing
    data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'http\S+', '', x))  # Remove URLs
    data['Tweets'] = data['Tweets'].apply(lambda x: re.sub(r'@[\w_]+', '', x))  # Remove user mentions
    data['Tweets'] = data['Tweets'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))  # Removing Punctuation

    return data

In [6]:
# Preprocess the domain data
data_domain = preprocess_domain_data(data_domain)

In [7]:
# Split the domain data into training and testing sets
X_domain = data_domain['Tweets']
y_domain = data_domain['domain_label']

In [8]:
X_train_domain, X_test_domain, y_train_domain, y_test_domain = train_test_split(X_domain, y_domain, test_size=0.2, random_state=42)

In [9]:
# Vectorize the text using CountVectorizer for domain classification
vectorizer_domain = CountVectorizer()
X_train_counts_domain = vectorizer_domain.fit_transform(X_train_domain)
X_test_counts_domain = vectorizer_domain.transform(X_test_domain)


In [10]:
# Create and train the Random Forest classifier for domain classification
rf_classifier_domain = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_domain.fit(X_train_counts_domain, y_train_domain)


In [11]:
# Preprocess the sentiment data
data_sentiment = preprocess_sentiment_data(data_sentiment)

In [12]:
# Split the sentiment data into training and testing sets
X_sentiment = data_sentiment['Tweets']
y_sentiment = data_sentiment['sentiment_label']

X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(X_sentiment, y_sentiment, test_size=0.2, random_state=42)


In [13]:
# Vectorize the text using CountVectorizer for sentiment classification
vectorizer_sentiment = CountVectorizer()
X_train_counts_sentiment = vectorizer_sentiment.fit_transform(X_train_sentiment)
X_test_counts_sentiment = vectorizer_sentiment.transform(X_test_sentiment)

In [14]:
# Create and train the Random Forest classifier for sentiment classification
rf_classifier_sentiment = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_sentiment.fit(X_train_counts_sentiment, y_train_sentiment)

In [24]:

# Example new tweets
new_tweets = [
    "This flim is one of the best film i have seen in my life",
    "I don't like this ai product.",
    "It is just a normal game today."
]


In [25]:
# Input sentiment labels for the new tweets
user_provided_sentiments = ['positive', 'negative', 'neutral']

In [26]:
# Make predictions using the trained classifier for domain classification
new_tweets_counts_domain = vectorizer_domain.transform(new_tweets)
new_tweet_predictions_domain = rf_classifier_domain.predict(new_tweets_counts_domain)


In [27]:
# Map the numeric predictions to domain labels
label_mapping_domain = {1: 'cinema', 2: 'sports', 4: 'technology'}
predicted_labels_domain = [label_mapping_domain.get(prediction, 'unknown') for prediction in new_tweet_predictions_domain]


In [28]:
# Map user-provided sentiment labels to numeric labels
label_mapping_sentiment = {'negative': 0, 'neutral': 1, 'positive': 2} 

In [29]:
# Make predictions using the trained classifier for sentiment classification
new_tweets_counts_sentiment = vectorizer_sentiment.transform(new_tweets)
new_tweet_predictions_sentiment = [label_mapping_sentiment[sentiment] for sentiment in user_provided_sentiments]


In [30]:
# Print the predictions with domain and sentiment labels
print("The given tweets are:")
print(new_tweets)
for tweet, domain_label, sentiment in zip(new_tweets, predicted_labels_domain, user_provided_sentiments):
    print(f"Tweet: {tweet}\nDomain Label: {domain_label}\nSentiment: {sentiment}\n")


The given tweets are:
['This flim is one of the best film i have seen in my life', "I don't like this ai product.", 'It is just a normal game today.']
Tweet: This flim is one of the best film i have seen in my life
Domain Label: cinema
Sentiment: positive

Tweet: I don't like this ai product.
Domain Label: technology
Sentiment: negative

Tweet: It is just a normal game today.
Domain Label: sports
Sentiment: neutral



In [31]:
# Evaluate the performance of the classifiers

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Domain classification performance metrics
y_pred_domain = rf_classifier_domain.predict(X_test_counts_domain)
accuracy_domain = accuracy_score(y_test_domain, y_pred_domain)
precision_domain = precision_score(y_test_domain, y_pred_domain, average='weighted')
recall_domain = recall_score(y_test_domain, y_pred_domain, average='weighted')
f1_score_domain = f1_score(y_test_domain, y_pred_domain, average='weighted')

# Generate target names based on unique class labels in y_test_domain
target_names_domain = [label_mapping_domain[i] for i in np.unique(y_test_domain)]

# Generate a classification report for domain classification
report_domain = classification_report(y_test_domain, y_pred_domain, labels=np.unique(y_test_domain), target_names=target_names_domain)



# Sentiment classification performance metrics
y_pred_sentiment = rf_classifier_sentiment.predict(X_test_counts_sentiment)
accuracy_sentiment = accuracy_score(y_test_sentiment, y_pred_sentiment)
precision_sentiment = precision_score(y_test_sentiment, y_pred_sentiment, average='weighted')
recall_sentiment = recall_score(y_test_sentiment, y_pred_sentiment, average='weighted')
f1_score_sentiment = f1_score(y_test_sentiment, y_pred_sentiment, average='weighted')

# Generate target names based on label_mapping_sentiment
target_names_sentiment = list(label_mapping_sentiment.keys())


# Generate a classification report for sentiment classification
report_sentiment = classification_report(y_test_sentiment, y_pred_sentiment, labels=np.unique(y_test_sentiment), target_names=target_names_sentiment)


# Print performance metrics for domain classification
print("Domain Classification Metrics:")
print(f"Accuracy: {accuracy_domain}")
print(f"Precision: {precision_domain}")
print(f"Recall: {recall_domain}")
print(f"F1 Score: {f1_score_domain}")
print("\nClassification Report for Domain Classification:")
print(report_domain)

# Print performance metrics for sentiment classification
print("\nSentiment Classification Metrics:")
print(f"Accuracy: {accuracy_sentiment}")
print(f"Precision: {precision_sentiment}")
print(f"Recall: {recall_sentiment}")
print(f"F1 Score: {f1_score_sentiment}")
print("\nClassification Report for Sentiment Classification:")
print(report_sentiment)


Domain Classification Metrics:
Accuracy: 0.9358974358974359
Precision: 0.9383955505279035
Recall: 0.9358974358974359
F1 Score: 0.9362215572692212

Classification Report for Domain Classification:
              precision    recall  f1-score   support

      cinema       0.96      0.91      0.94        90
      sports       0.88      0.96      0.92        78
  technology       0.97      0.94      0.95        66

    accuracy                           0.94       234
   macro avg       0.94      0.94      0.94       234
weighted avg       0.94      0.94      0.94       234


Sentiment Classification Metrics:
Accuracy: 0.9281032502196095
Precision: 0.9289374416432975
Recall: 0.9281032502196095
F1 Score: 0.928074709069852

Classification Report for Sentiment Classification:
              precision    recall  f1-score   support

    negative       0.95      0.91      0.93      4463
     neutral       0.91      0.95      0.93      6213
    positive       0.94      0.90      0.92      4123

   