In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import random

In [20]:
# Load the labelled examples; later cells read its 'text' and 'label' columns.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [21]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below (stop-word list and WordNet data).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Lemmatizer and the English stop-word list. The list is kept as a list
# because it is printed below and read again by later cells.
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')

# Words taken OUT of the stop-word list so they survive the filtering below
# (presumably because negations and short reply tokens matter for the labels
# -- TODO confirm with the dataset author).
words_to_remove_from_stopwords = [
    'not','in',"can't",'no','nor','a','b','c','d','with','but','don', "don't",'ain',
    'aren', "aren't", 'couldn', "couldn't",'didn', "didn't", 'doesn', "doesn't",'won',
    "won't", 'wouldn', "wouldn't","why"
]

# Entries that are not present in the NLTK list are simply skipped.
for word in words_to_remove_from_stopwords:
    if word in all_stopwords:
        all_stopwords.remove(word)

# Build the lookup set once rather than once per word inside the loop.
stopword_set = set(all_stopwords)

# Clean every text: keep letters and apostrophes, lower-case, drop stop words,
# lemmatize, and join the surviving tokens back into one string.
corpus = []
length = len(dataset['text'])
# Iterate over the values positionally so that a non-default index can neither
# raise KeyError nor reorder rows relative to dataset['label'].
for text in dataset['text']:
    review = re.sub(r"[^a-zA-Z']", ' ', text)  # Keep apostrophes
    tokens = review.lower().split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopword_set]
    corpus.append(' '.join(tokens))

assert len(corpus) == length, "corpus must contain one entry per dataset row"

print(corpus[:10])  # Print the first 10 processed texts for verification
print(all_stopwords)


['absolutely', 'not', "i'm in", 'no way', 'count in', "don't want", 'sure thing', 'no thanks', 'definitely', 'not interested']
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'how', 'all', 'any', 'both', 'each', 'few', 'more

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams, bigrams and trigrams of the cleaned corpus.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary/IDF statistics also see the test rows.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X = vectorizer.fit_transform(raw_documents=corpus)

# Target labels taken from the same dataset the corpus was built from.
y = dataset.loc[:, 'label']

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with explicit hyperparameters. random_state is pinned so the
# random feature sampling (max_features='log2'), and therefore the fitted
# model and every metric derived from it, is identical on each
# Restart & Run All. The value matches the seed used for the train/test split.
classifier = RandomForestClassifier(
    n_estimators=200,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    bootstrap=False,
    random_state=0,
)
classifier.fit(X_train, y_train)



In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the held-out split with the fitted forest.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[54  5]
 [ 0 63]]


0.9590163934426229

In [26]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Cross-validate a default-parameter random forest on the full feature matrix.
# It is bound to its own name so that the already-fitted `classifier` from the
# training cell is not replaced by an unfitted estimator.
# NOTE(review): X comes from a vectorizer fitted on every row, so each fold's
# held-out rows contributed to the vocabulary/IDF -- confirm this is acceptable.
cv_classifier = RandomForestClassifier(random_state=0)

# Stratified 10-fold split; the seed makes the shuffled folds repeatable.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# One accuracy value per fold (cross_val_score fits clones of the estimator,
# so cv_classifier itself stays unfitted).
accuracy_scores = cross_val_score(cv_classifier, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

# Per-fold accuracy scores
print("Cross-validation accuracy scores:")
print(accuracy_scores)

# Mean and standard deviation across the folds
print(f"Mean accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Standard deviation: {np.std(accuracy_scores):.4f}")


Cross-validation accuracy scores:
[1.         1.         0.97560976 1.         1.         1.
 1.         1.         1.         1.        ]
Mean accuracy: 0.9976
Standard deviation: 0.0073


In [27]:
# Hand-written sample replies used to spot-check the model's predictions.
# The prediction loop that consumes this list also needs `vectorizer` and
# `classifier` from the earlier cells.

test_texts = [
    "I'd love to play the game!",
    "No way, not interested.",
    "Sure, why not?",
    "Absolutely not.",
    "Count me in!",
    "I'd rather not.",
    "This sounds fun!",
    "I'm not up for it.",
    "Yes, let's do it!",
    "No, thank you.",
]


# Preprocess function
def preprocess_text(text):
    """Normalise one raw string with the same steps used to build the corpus.

    Every character other than an ASCII letter or an apostrophe is replaced
    by a space, the text is lower-cased and split on whitespace, tokens found
    in the module-level ``all_stopwords`` are dropped, and the remaining
    tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # Build the lookup set once per call rather than once per token.
    stopword_set = set(all_stopwords)
    cleaned = re.sub(r"[^a-zA-Z']", ' ', text)  # Keep apostrophes
    tokens = cleaned.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set
    )

# Predict for each text in test_texts

# Fallback: if `classifier` is currently bound to an unfitted estimator (it has
# no `estimators_` attribute), fit it on the training split. The check does not
# depend on the text being processed, so it runs once before the loop.
if not hasattr(classifier, 'estimators_'):
    classifier.fit(X_train, y_train)

for text in test_texts:
    # Apply the same cleaning as the training corpus, then vectorize with the
    # already-fitted vectorizer (transform only, no refit).
    processed_text = preprocess_text(text)
    X_custom = vectorizer.transform([processed_text])

    # predict() returns an array with one label for the single input row.
    predicted_class = classifier.predict(X_custom)[0]

    print(f"Text: '{text}' -> Predicted class: {predicted_class}")


Text: 'I'd love to play the game!' -> Predicted class: yes
Text: 'No way, not interested.' -> Predicted class: no
Text: 'Sure, why not?' -> Predicted class: yes
Text: 'Absolutely not.' -> Predicted class: no
Text: 'Count me in!' -> Predicted class: yes
Text: 'I'd rather not.' -> Predicted class: no
Text: 'This sounds fun!' -> Predicted class: yes
Text: 'I'm not up for it.' -> Predicted class: no
Text: 'Yes, let's do it!' -> Predicted class: yes
Text: 'No, thank you.' -> Predicted class: no


In [33]:
import joblib

# Write the current classifier and the fitted vectorizer to the working directory.
joblib.dump(value=classifier, filename='model.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')

# Reload both artifacts from disk under separate names.
# joblib files are pickle-based: only load files that come from a trusted source.
vector = joblib.load(filename='vectorizer.pkl')
mod = joblib.load(filename='model.pkl')

In [37]:
# Run the reloaded artifacts on one sample. The text goes through
# preprocess_text first so inference sees the same cleaning (lower-casing,
# stop-word removal, lemmatization) as the corpus the vectorizer was fitted on.
mod.predict(vector.transform([preprocess_text("sure why not")]))

array(['yes'], dtype=object)