In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,CategoricalNB,ComplementNB,GaussianNB
from sklearn.metrics import classification_report,confusion_matrix
import joblib
from sklearn.ensemble import RandomForestClassifier

In [4]:
df = pd.read_csv("english.csv", encoding='windows-1252')
print(df.shape)

(2101, 2)


In [5]:
X = df['title']
y = df['category']

In [4]:
help(MultinomialNB)

Help on class MultinomialNB in module sklearn.naive_bayes:

class MultinomialNB(_BaseDiscreteNB)
 |  MultinomialNB(*, alpha=1.0, force_alpha=True, fit_prior=True, class_prior=None)
 |
 |  Naive Bayes classifier for multinomial models.
 |
 |  The multinomial Naive Bayes classifier is suitable for classification with
 |  discrete features (e.g., word counts for text classification). The
 |  multinomial distribution normally requires integer feature counts. However,
 |  in practice, fractional counts such as tf-idf may also work.
 |
 |  Read more in the :ref:`User Guide <multinomial_naive_bayes>`.
 |
 |  Parameters
 |  ----------
 |  alpha : float or array-like of shape (n_features,), default=1.0
 |      Additive (Laplace/Lidstone) smoothing parameter
 |      (set alpha=0 and force_alpha=True, for no smoothing).
 |
 |  force_alpha : bool, default=True
 |      If False and alpha is less than 1e-10, it will set alpha to
 |      1e-10. If True, alpha will remain unchanged. This may cause
 | 

# naive bayes

In [6]:
# Vectorize titles
vectorizer = TfidfVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)
model = MultinomialNB(alpha=0.9)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

                       precision    recall  f1-score   support

    Drainage & Sewage       0.82      0.96      0.89        53
          Electricity       0.90      0.93      0.91        57
                Other       0.84      0.76      0.80        63
Public Infrastructure       0.70      0.81      0.75        57
                 Road       0.85      0.85      0.85        61
   Sanitation & Waste       0.95      0.84      0.89        70
         Water Issues       0.87      0.78      0.82        60

             accuracy                           0.85       421
            macro avg       0.85      0.85      0.85       421
         weighted avg       0.85      0.85      0.85       421

[[51  0  0  1  0  0  1]
 [ 0 53  0  2  1  0  1]
 [ 1  2 48  8  1  2  1]
 [ 1  3  1 46  4  1  1]
 [ 0  0  2  5 52  0  2]
 [ 1  0  6  1  2 59  1]
 [ 8  1  0  3  1  0 47]]


In [None]:
# import joblib

# joblib.dump(model, 'model.pkl')
# joblib.dump(vectorizer, 'vectorizer.pkl')


In [7]:
input_title = ["school area destroyed"]
input_vec = vectorizer.transform(input_title)
predicted_category = model.predict(input_vec)

print(f"Predicted Category: {predicted_category[0]}")


Predicted Category: Water Issues


# bert

In [6]:
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Suppose X is your pandas Series of text, y is labels
X_list = X.tolist()

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode input
encoded_input = tokenizer(
    X_list,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

# Pass input through BERT model

output = model(**encoded_input)

# Get the [CLS] token embedding for each sentence
# output.last_hidden_state shape: (batch_size, sequence_length, hidden_size)
# CLS token is at position 0
cls_embeddings = output.last_hidden_state[:, 0, :]

# Convert to numpy for sklearn
X_embedded = cls_embeddings.numpy()

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, test_size=0.1, random_state=42)

# Train Logistic Regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Check accuracy
accuracy = clf.score(X_test, y_test)
print("Accuracy:", accuracy)


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "C:\Users\salan katwal\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

# Spacy

In [8]:
    import spacy
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    import pandas as pd
    import numpy as np
    
    # Load SpaCy model with medium-sized word vectors
    nlp = spacy.load("en_core_web_lg")
    
    # Load data
    X_text = df["title"]
    y = df["category"]
    def get_vector(text):
        doc = nlp(text)
        vectors = [token.vector for token in doc if not token.is_stop and not token.is_punct and token.has_vector]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(nlp.vocab.vectors_length)  # fallback if no valid tokens
    
    X_vec = [get_vector(text) for text in X_text]
    
    
    
    # Convert sentences to vector averages
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.1, random_state=42)
    
    # Train classifier
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)


  r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


In [9]:
accuracy = clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8862559241706162


In [10]:
# joblib.dump(clf, 'spacy_model.pkl')

['spacy_model.pkl']

In [11]:
# 
test_input = "hello"
predicted = clf.predict([nlp(test_input).vector])
print("Prediction:", predicted[0])

Prediction: Road


## RandomForestClassifier

In [34]:
rfc = RandomForestClassifier(n_estimators=100,max_depth=30,min_samples_leaf=2,min_samples_split=6,random_state=42,verbose=1,n_jobs=4)
rfc.fit(X_train,y_train)
accuracy_2 = rfc.score(X_test,y_test)
print(accuracy_2)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s


0.8483412322274881


[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [20]:
param_grid = {
    "n_estimators": [200,400,600],
    "max_depth": [10, 20, 25],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Set up the model and GridSearchCV
rfc = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 400}
Best CV Accuracy: 0.8671957671957672
