In [1]:
import spacy
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
import nltk
import re,string
nltk.download('stopwords')
from nltk.corpus import stopwords
stopword=set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer("english")

#read the dataset with name "Fake_Real_Data.csv" and store it in a variable df
df = pd.read_csv("C:\\Users\\Admin\\Downloads\\twitter_data.csv")

#print the shape of dataframe
print(df.shape)

#print top 5 rows
df.head(5)



(24783, 7)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
df["labels"] = df['class'].map({0: "Hate Speech", 1: "Offensive Speech", 2: "No Hate and Offensive Speech"})
df = df[["tweet", "labels"]]

In [4]:
 def preprocess(text):
        test_list = text.split()
        text = ""
        for word in test_list:
            if word.endswith("ing"):
                word=word[:-3]
            text+=word
            text+=" "
        doc = nlp(text)
        filtered_tokens = []
        
        for token in doc:
            if token.is_stop or token.is_punct:
                continue
            filtered_tokens.append(token.lemma_)
            
        return " ".join(filtered_tokens)

In [5]:
df["tweet_new"] = df.tweet.apply(preprocess)

In [6]:
df.head()

Unnamed: 0,tweet,labels,tweet_new
0,!!! RT @mayasolovely: As a woman you shouldn't...,No Hate and Offensive Speech,RT @mayasolovely woman complain clean house am...
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Offensive Speech,RT @mleew17 boy dat cold tyga dwn bad cuffin d...
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Offensive Speech,RT @UrKindOfBrand Dawg RT @80sbaby4life fuck b...
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Offensive Speech,RT @C_G_Anderson @viva_base look like tranny
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Offensive Speech,RT @shenikaroberts shit hear true faker bitch ...


In [7]:
def clean(text):
    text = str (text). lower()
    text = re. sub('[.?]', '', text)
    text = re. sub('https?://\S+|www.\S+', '', text)
    text = re. sub('<.?>+', '', text)
    text = re. sub('[%s]' % re. escape(string. punctuation), '', text)
    text = re. sub('\n', '', text)
    text = re. sub('\w\d\w', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ". join(text)
    text = [stemmer. stem(word) for word in text. split(' ')]
    text=" ". join(text)
    return text

In [8]:
df["tweet_new"] = df.tweet_new.apply(clean)

In [9]:
x = np. array(df["tweet_new"])
y = np. array(df["labels"])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [10]:
def preprocess_text_with_spacy(text):
    doc = nlp(text)
    words = [word.text for word in doc if not word.is_punct and not word.is_space]
    return words

In [11]:
# Preprocess and create TaggedDocuments
tagged_data = [
    TaggedDocument(preprocess_text_with_spacy(X_train[i]), [str(i)])
    for i in range(len(X_train))
]

In [12]:
# Train a Doc2Vec model on the tagged data
doc2vec_model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4, epochs=20)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [13]:
# Convert training data to Doc2Vec vectors
X_train_doc2vec = [doc2vec_model.infer_vector(preprocess_text_with_spacy(sentence)) for sentence in X_train]

In [14]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

In [15]:
X_train_d2v, X_test_d2v, y_train_enc, y_test_enc = train_test_split(X_train_doc2vec, y_train_encoded, test_size=0.2, random_state=42)

In [16]:
clf = SVC()
clf.fit(X_train_d2v, y_train_enc)

#Testing the model
y_pred = clf.predict (X_test_d2v)

#Accuracy Score of our model
from sklearn.metrics import accuracy_score
print (accuracy_score (y_test_enc,y_pred))

new_string = "hi"
new_string = preprocess(new_string)

# Convert the new input to a Doc2Vec vector
new_string_vector = doc2vec_model.infer_vector(preprocess_text_with_spacy(new_string))

# Predict the label
predicted_label_encoded = clf.predict([new_string_vector])
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

0.8482384823848238
Predicted Label: ['Offensive Speech']


In [17]:
from sklearn.tree import DecisionTreeClassifier

#Model building
model = DecisionTreeClassifier()
#Training the model
model.fit(X_train_d2v, y_train_enc)
#Testing the model
y_pred = model.predict (X_test_d2v)
#Accuracy Score of our model
from sklearn.metrics import accuracy_score
print (accuracy_score (y_test_enc,y_pred))

new_string = "hi"
new_string = preprocess(new_string)

# Convert the new input to a Doc2Vec vector
new_string_vector = doc2vec_model.infer_vector(preprocess_text_with_spacy(new_string))

# Predict the label
predicted_label_encoded = clf.predict([new_string_vector])
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

0.7383318277627221
Predicted Label: ['Offensive Speech']


In [18]:
from sklearn.naive_bayes import MultinomialNB

#Model building
model = MultinomialNB()
#Training the model
model.fit(X_train_d2v, y_train_enc)
#Testing the model
y_pred = model.predict (X_test_d2v)
#Accuracy Score of our model
from sklearn.metrics import accuracy_score
print (accuracy_score (y_test_enc,y_pred))

new_string = "hi"
new_string = preprocess(new_string)

# Convert the new input to a Doc2Vec vector
new_string_vector = doc2vec_model.infer_vector(preprocess_text_with_spacy(new_string))

# Predict the label
predicted_label_encoded = clf.predict([new_string_vector])
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [19]:
from sklearn.ensemble import RandomForestClassifier

#Model building
model = RandomForestClassifier()
#Training the model
model.fit(X_train_d2v,y_train_enc)
#Testing the model
y_pred = model.predict (X_test_d2v)
#Accuracy Score of our model
from sklearn.metrics import accuracy_score
print (accuracy_score (y_test_enc,y_pred))
#Predicting the outcome
new_string = "hi"
new_string = preprocess(new_string)

# Convert the new input to a Doc2Vec vector
new_string_vector = doc2vec_model.infer_vector(preprocess_text_with_spacy(new_string))

# Predict the label
predicted_label_encoded = clf.predict([new_string_vector])
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

print("Predicted Label:", predicted_label)

0.8470340258958146
Predicted Label: ['Offensive Speech']


In [20]:
from sklearn.ensemble import VotingClassifier

# Initialize individual models
dt_model = DecisionTreeClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)  # Enable probability estimation for SVM

# Define the ensemble of models
ensemble_model = VotingClassifier(estimators=[
    ('decision_tree', dt_model),
    ('random_forest', rf_model),
    ('svm', svm_model)
], voting='soft')  # Use soft voting for probability averaging

# Training the ensemble model
ensemble_model.fit(X_train_d2v, y_train_enc)

# Testing the ensemble model
y_pred_ensemble = ensemble_model.predict(X_test_d2v)

# Accuracy Score of ensemble model
print("Ensemble Accuracy Score:", accuracy_score(y_test_enc, y_pred_ensemble))

Ensemble Accuracy Score: 0.8331827762722072


In [24]:
# Get input from the user
user_input = input("Enter the text you want to classify: ")

# Preprocess the user input
user_input_processed = preprocess(user_input)

user_input_vectorized = doc2vec_model.infer_vector(preprocess_text_with_spacy(user_input_processed))

# Make prediction using the ensemble model
predicted_label_encoded = ensemble_model.predict([new_string_vector])
predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

# Print the predicted label
print("Predicted label:", predicted_label)

Enter the text you want to classify: good
Predicted label: ['Offensive Speech']
