In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score



# Load the Excel file into a DataFrame
data = pd.read_excel('NLP_Data.xlsx')
data = data.rename(columns={'Description of the Grievance': 'text', 'Grievance Category': 'category'})
data.dropna(subset = ['text'], inplace = True)
data.dropna(subset = ['category'], inplace = True)
X = data['text']
y = data['category']

# Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:")
print(missing_values)

# Drop rows with missing values
data.dropna(inplace=True)
X = data['Grievance SubCategory']


# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3834)

# Text cleaning
def clean_text(text):
    
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    return text
data['text'] = data['text'].apply(clean_text)
data['category'] = data['category'].apply(clean_text)

# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

data['text'] = data['text'].apply(lemmatize_text)
data['category'] = data['category'].apply(lemmatize_text)

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Classification
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
accuracy



Missing Values:
text                     0
category                 0
Grievance SubCategory    0
dtype: int64


LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\KIIT/nltk_data'
    - 'C:\\Users\\KIIT\\anaconda3\\nltk_data'
    - 'C:\\Users\\KIIT\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\KIIT\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\KIIT\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
