In [1]:
import fasttext
import pandas as pd
import spacy
import nltk
import re
import string
from nltk.corpus import stopwords
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

# Read the dataset with the name "Fake_Real_Data.csv" and store it in a variable df
df = pd.read_csv("C:\\Users\\Admin\\Downloads\\twitter_data.csv")
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
df["labels"] = df['class'].map({0: "Hate_Speech", 1: "Offensive_Speech", 2: "No_Hate_and_Offensive_Speech"})
df = df[["tweet", "labels"]]

def preprocess(text):
    test_list = text.split()
    text = ""
    for word in test_list:
        if word.endswith("ing"):
            word = word[:-3]
        text += word
        text += " "
    doc = nlp(text)
    filtered_tokens = []

    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    return " ".join(filtered_tokens)

df["tweet_new"] = df.tweet.apply(preprocess)

def clean(text):
    text = str(text).lower()
    text = re.sub('[.?]', '', text)
    text = re.sub('https?://\S+|www.\S+', '', text)
    text = re.sub('<.?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

df["tweet_new"] = df.tweet.apply(clean)
df['labels'] = '__label__' + df['labels'].astype(str)
df['category_description'] = df['labels'] + ' ' + df['tweet_new']

In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['tweet_new'], df['labels'], test_size=0.2, random_state=42)

# Train a FastText model
model = fasttext.train_supervised(input="hate.train")

# Get FastText embeddings for the training and testing data
X_train_fasttext = [model.get_sentence_vector(text) for text in X_train]
X_test_fasttext = [model.get_sentence_vector(text) for text in X_test]

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [5]:
# Train an SVM classifier using FastText features
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_fasttext, y_train_encoded)
# Evaluate SVM classifier
svm_accuracy = svm_classifier.score(X_test_fasttext, y_test_encoded)
print("SVM Accuracy:", svm_accuracy)
# Predict with SVM
txt = "hi guys what's up"
txt = preprocess(txt)
txt_fasttext = model.get_sentence_vector(txt)
svm_prediction = svm_classifier.predict([txt_fasttext])
print("SVM Prediction:", label_encoder.inverse_transform(svm_prediction))

SVM Accuracy: 0.9419003429493645
SVM Prediction: ['__label__No_Hate_and_Offensive_Speech']


In [6]:
# Train a Random Forest classifier using FastText features
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_fasttext, y_train_encoded)
# Evaluate Random Forest classifier
rf_accuracy = rf_classifier.score(X_test_fasttext, y_test_encoded)
print("Random Forest Accuracy:", rf_accuracy)
# Predict with Random Forest
rf_prediction = rf_classifier.predict([txt_fasttext])
print("Random Forest Prediction:", label_encoder.inverse_transform(rf_prediction))

Random Forest Accuracy: 0.9453298365947146
Random Forest Prediction: ['__label__No_Hate_and_Offensive_Speech']


In [7]:
from sklearn.tree import DecisionTreeClassifier

model1 = DecisionTreeClassifier()
model1.fit(X_train_fasttext, y_train_encoded)
# Evaluate Random Forest classifier
accuracy = model1.score(X_test_fasttext, y_test_encoded)
print("Decision tree Accuracy:", accuracy)
# Predict with Random Forest
model_prediction = model1.predict([txt_fasttext])
print("Random Forest Prediction:", label_encoder.inverse_transform(model_prediction))

Decision tree Accuracy: 0.9122453096631027
Random Forest Prediction: ['__label__Offensive_Speech']


In [8]:
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()
model1.fit(X_train_fasttext, y_train_encoded)
# Evaluate Random Forest classifier
accuracy = model1.score(X_test_fasttext, y_test_encoded)
print("Logistic Regression Accuracy:", accuracy)
# Predict with Random Forest
model_prediction = model1.predict([txt_fasttext])
print("Random Forest Prediction:", label_encoder.inverse_transform(model_prediction))

Logistic Regression Accuracy: 0.9437159572321969
Random Forest Prediction: ['__label__No_Hate_and_Offensive_Speech']


In [9]:
from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB()
model1.fit(X_train_fasttext, y_train_encoded)
# Evaluate Random Forest classifier
accuracy = model1.score(X_test_fasttext, y_test_encoded)
print("Naive Accuracy:", accuracy)
# Predict with Random Forest
model_prediction = model1.predict([txt_fasttext])
print("Random Forest Prediction:", label_encoder.inverse_transform(model_prediction))

ValueError: Negative values in data passed to MultinomialNB (input X)

In [10]:
from sklearn.neighbors import KNeighborsClassifier

model1 = KNeighborsClassifier(n_neighbors=5)
model1.fit(X_train_fasttext, y_train_encoded)
# Evaluate Random Forest classifier
accuracy = model1.score(X_test_fasttext, y_test_encoded)
print("Naive Accuracy:", accuracy)
# Predict with Random Forest
model_prediction = model1.predict([txt_fasttext])
print("Random Forest Prediction:", label_encoder.inverse_transform(model_prediction))

Naive Accuracy: 0.9437159572321969
Random Forest Prediction: ['__label__No_Hate_and_Offensive_Speech']


In [11]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Define individual models
decision_tree_model = DecisionTreeClassifier()
random_forest_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)  # Enable probability estimation for SVM

# Define the ensemble of models
ensemble_model = VotingClassifier(estimators=[
    ('decision_tree', decision_tree_model),
    ('random_forest', random_forest_model),
    ('svm', svm_model)
], voting='soft')  # Use soft voting for probability averaging

# Training the ensemble model
ensemble_model.fit(X_train_fasttext, y_train_encoded)

# Testing the ensemble model
y_pred_ensemble = ensemble_model.predict(X_test_fasttext)

# Accuracy Score of ensemble model
accuracy_ensemble = accuracy_score(y_test_encoded, y_pred_ensemble)
print("Ensemble Accuracy Score:", accuracy_ensemble)

Ensemble Accuracy Score: 0.9370587048618115


In [18]:
# Get input from the user
user_input = input("Enter the text you want to classify: ")

# Preprocess the user input
user_input_processed = preprocess(user_input)

# Transform the preprocessed input using the FastText model to get embeddings
user_input_embedding = model.get_sentence_vector(user_input_processed)

# Make prediction using the ensemble model
prediction = ensemble_model.predict([user_input_embedding])

# Convert the predicted label back to its original form
predicted_label = label_encoder.inverse_transform(prediction)[0]

# Print the predicted label
print("Predicted label:", predicted_label)

Enter the text you want to classify: hi sister
Predicted label: __label__No_Hate_and_Offensive_Speech
