In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing
import matplotlib.pyplot as plt #visuals
import seaborn as sns # statistical data visualisation

# Absolute locations of the two Excel workbooks this notebook reads.
train_path = r'C:\Users\Admin\Desktop\new data\train.xlsx'
predict_path = r'C:\Users\Admin\Desktop\new data\predict.xlsx'

# `train` supplies the "review" text and the "code" target used by later cells;
# `predict` supplies the "review" text that gets scored at the end.
train = pd.read_excel(train_path)
predict = pd.read_excel(predict_path)



In [None]:
# Display the first fifteen rows
display(train.head(15))

# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
train.info()

In [None]:
# Character length of every review (NaN where the review is missing)
train["length"] = train["review"].str.len()

# Number of whitespace-separated words per review. `.str.len()` on the split
# result is used instead of `.apply(len)` so that a missing review yields NaN
# rather than raising "object of type 'float' has no len()".
train["word_count"] = train["review"].str.split().str.len()

# Display the two columns
display(train[["length", "word_count"]])

# Look at the distribution of length and word_count.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is the
# replacement seaborn documents for the same histogram-plus-KDE figure.
for column, title in [
    ("length", "Distribution of review lengths"),
    ("word_count", "Distribution of word counts"),
]:
    sns.histplot(train[column], bins=10, kde=True, stat="density")
    plt.title(title)
    plt.show()

# Print how many reviews fall into each of 10 equal-width bins, per column
for column in ("length", "word_count"):
    print(pd.cut(train[column], 10).value_counts())

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Import the Snowball stemmer for stemming English words
from snowballstemmer import EnglishStemmer

# One Snowball stemmer instance, shared by every tokenize() call
en_stemmer = EnglishStemmer()

# NLTK's English stop-word list
en_stopwords = stopwords.words("english")

# Extra tokens to drop in addition to the NLTK list
ext_stopwords = [
    "ni", "jp", "@", "na", "paka", "kumi", "i",
    "nini", "nyinyi", "rada", "hii", "this", "coz", "my",
    "to", "me", "yote", "you", "jackpot", "is", "a",
]

# Combined list: NLTK entries first, then the extras, order preserved
full_stopwords = [*en_stopwords, *ext_stopwords]

def tokenize(review):
    """Turn one review into a list of stemmed, stop-word-free tokens.

    Parameters
    ----------
    review : str
        Review text. No case folding happens here and `full_stopwords` is
        matched by exact string comparison, so lower-case the text before
        calling if the stop-word filter should be case-insensitive.

    Returns
    -------
    list of str
        Stemmed tokens in their original order. An empty list is returned when
        `review` is not a string (e.g. the NaN pandas uses for a missing cell),
        so a single blank row no longer aborts `Series.apply(tokenize)`.
    """
    # Missing cells arrive as float NaN / None; word_tokenize cannot handle them.
    if not isinstance(review, str):
        return []

    # preserve_line=True: tokenize the text as one line, without sentence splitting
    tokens = word_tokenize(review, preserve_line=True)

    # Drop stop words and stem the survivors in a single pass
    return [
        en_stemmer.stemWord(token)
        for token in tokens
        if token not in full_stopwords
    ]

# Lower-case every review and tokenize it; both frames get the same treatment
for frame in (train, predict):
    frame["Tokenized"] = frame["review"].str.lower().apply(tokenize)

# Inspect the token lists of the first fifteen training reviews
display(train["Tokenized"].head(15))

In [None]:
# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Join each token list back into one space-separated string
train['tokenized_str'] = train['Tokenized'].apply(" ".join)

# Learn the vocabulary and idf weights, and build the document-term matrix, in one step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(train["tokenized_str"])

# Let's see what we have
display(tfidf_matrix)

# Dense DataFrame view of the matrix: one row per review, one column per vocabulary term
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the first five rows of the tfidf DataFrame
display(tfidf_df.head())

In [None]:
# Import necessary tools from sklearn
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Features are the space-joined token strings; the target is the "code" column
X, y = train['tokenized_str'], train["code"]

# Hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=34,
    stratify=y,
)

In [None]:
#Importing necessary libraries
import nltk
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 
#Loading the dataset
# NOTE(review): this is a second data source, separate from the `train`/`predict`
# workbooks read at the top of the notebook. The relative path is resolved against
# the kernel's working directory. cleaning() below reads a 'sentences' column from it.
data = pd.read_csv('Finance_data.csv')
#Pre-Processing the text
def cleaning(df, stop_words):
    """Normalise the text in ``df['sentences']`` in place and return ``df``.

    Steps, applied in order to every sentence:
      1. lower-case and collapse runs of whitespace to single spaces;
      2. delete every run of digits;
      3. drop words that appear in ``stop_words``;
      4. lemmatize each remaining word with ``textblob.Word.lemmatize()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``'sentences'``. The column is
        overwritten with the cleaned text (the frame is mutated, not copied).
    stop_words : iterable of str
        Words to remove; compared against the already lower-cased words.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Set membership keeps the per-word stop-word check O(1)
    stop_set = set(stop_words)

    text = df['sentences'].apply(
        lambda sentence: ' '.join(word.lower() for word in sentence.split())
    )
    # Replacing the digits/numbers. The pattern must be the regex r'\d+':
    # the previous literal 'd' removed the letter "d" from every word instead.
    text = text.str.replace(r'\d+', '', regex=True)
    # Removing stop words (split() also swallows the gaps left by removed digits)
    text = text.apply(
        lambda sentence: ' '.join(word for word in sentence.split() if word not in stop_set)
    )
    # Lemmatization
    text = text.apply(
        lambda sentence: ' '.join(Word(word).lemmatize() for word in sentence.split())
    )

    df['sentences'] = text
    return df
# English stop words from NLTK, handed to cleaning() below
stop_words = stopwords.words('english')
# cleaning() rewrites data['sentences'] and returns the same frame
data_cleaned = cleaning(data, stop_words)

#Converting the cleaned text to padded integer sequences
# cleaning() only normalises the 'sentences' column, so that is the column the
# tokenizer has to read; the 'verified_reviews' key used before is never created
# or cleaned anywhere in this notebook.
# NOTE(review): confirm Finance_data.csv has no separate 'verified_reviews' column
# that was intended here.
texts = data_cleaned['sentences'].values

# Keep only the 500 most frequent words when converting text to indices
tokenizer = Tokenizer(num_words=500, split=' ')
tokenizer.fit_on_texts(texts)

# NOTE: this rebinds X (an earlier cell set it to train['tokenized_str']);
# X_train / X_test from the earlier split are not affected.
X = pad_sequences(tokenizer.texts_to_sequences(texts))
#Preparing the inputs
# X_train / X_test come from the earlier split of train['tokenized_str'] and are
# still raw strings, and y_train / y_test are the raw "code" labels. Neither can
# be fed to an Embedding/softmax network as-is, so both are encoded first.
seq_len = X.shape[1]
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(list(X_train)), maxlen=seq_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(list(X_test)), maxlen=seq_len)
# NOTE(review): `tokenizer` was fitted on data_cleaned (Finance_data.csv), not on
# the reviews classified here -- confirm that vocabulary is the intended one.

# Map the labels to 0..n_classes-1 so they can index the softmax outputs
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)
n_classes = len(label_encoder.classes_)

#Model Building
model = Sequential()
# 500 matches Tokenizer(num_words=500): indices produced by the tokenizer are < 500
model.add(Embedding(500, 120, input_length=seq_len))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352, activation='LeakyReLU'))
# One output unit per class found in y_train (previously hard-coded to 3)
model.add(Dense(n_classes, activation='softmax'))
# Integer class ids -> sparse_categorical_crossentropy; the plain categorical
# variant expects one-hot targets, which were never built.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

#Model Training
model.fit(X_train_seq, y_train_enc, epochs=20, batch_size=32, verbose=1)

#Model Testing
model.evaluate(X_test_seq, y_test_enc)

In [None]:
# Create the tf-idf vectorizer
model_vectorizer = TfidfVectorizer()

# Learn the vocabulary and idf weights from the training split only
tfidf_train = model_vectorizer.fit_transform(X_train)

# Transform the test split with the vocabulary learned above
tfidf_test = model_vectorizer.transform(X_test)

# Initialize the Bernoulli Naive Bayes classifier
nb = BernoulliNB()

# Fit the model
nb.fit(tfidf_train, y_train)

# Predict the labels of the held-out split
y_pred = nb.predict(tfidf_test)

# Accuracy of the fitted model on the held-out split.
# The figure reported before was cross_val_score(nb, tfidf_test, y_test).max():
# that refits fresh copies of the classifier on slices of the *test* split and
# keeps only the best fold, so it did not describe `nb` and was biased upwards.
test_accuracy = accuracy_score(y_test, y_pred)
best_accuracy = test_accuracy  # old name kept for any later cell that reads it
print("Accuracy:", test_accuracy)

# 10-fold cross-validation on the training split, reported as mean and spread,
# as a stability check that leaves the held-out rows untouched.
cv_scores = cross_val_score(nb, tfidf_train, y_train, cv=10, scoring='accuracy')
print("CV accuracy (train): %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))

# Print the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix\n")
print(cm)

# Print the Classification Report
cr = classification_report(y_test, y_pred)
print("\n\nClassification Report\n")
print(cr)

In [None]:
# Convert tokenized words from list to string
predict['tokenized_str'] = predict['Tokenized'].apply(" ".join)

# Look at the data that will be scored
display(predict.head(10))

# info() prints its own report and returns None; print() around it only added a
# stray "None" line to the output.
predict.info()

In [None]:
# Vectorise the unlabelled reviews with the vectorizer already fitted on X_train
tfidf_final = model_vectorizer.transform(predict["tokenized_str"])

# Predict one label per review and store it next to the text
predict["code"] = y_pred_final = nb.predict(tfidf_final)
print(predict)
