In [1]:
import pandas as pd
import numpy as np

In [2]:
ds=pd.read_csv("langdatabalanced.csv")

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import *
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 

In [7]:
# Assuming 'short_description' is the column you want to use for features
x_train, x_test, y_train, y_test = train_test_split(ds['short_description'],
                                                    ds.category,
                                                    test_size=0.2,
                                                    random_state=2022,
                                                    stratify=ds.category)

In [13]:
classifiers = {
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='auto', p=2),
    'DecisionTreeClassifier': DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=4),
    'LogisticRegression': LogisticRegression(),
    'LinearSVM': LinearSVC()
}

# Loop through classifiers and train each
for name, clf in classifiers.items():
    print(f"\nTraining with {name}")
    
    # Create pipeline for each classifier
    pipeline = Pipeline([
        ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
        ('classifier', clf)
    ])
    
    # Train the model
    pipeline.fit(x_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(x_test)
    
    # Print classification report
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred,zero_division=0))



Training with KNeighborsClassifier
Results for KNeighborsClassifier:
                precision    recall  f1-score   support

          ARTS       0.04      0.01      0.01       169
ARTS & CULTURE       0.04      0.12      0.07       169
  BLACK VOICES       0.06      0.07      0.06       169
      BUSINESS       0.02      0.05      0.03       169
       COLLEGE       0.15      0.21      0.17       168
        COMEDY       0.05      0.22      0.08       169
         CRIME       0.10      0.09      0.10       169
     EDUCATION       0.14      0.02      0.03       168
 ENTERTAINMENT       0.05      0.08      0.06       169
         FIFTY       0.00      0.00      0.00       169
     GOOD NEWS       0.03      0.08      0.05       169
         GREEN       0.05      0.01      0.02       169
HEALTHY LIVING       0.07      0.06      0.06       168
        IMPACT       0.03      0.01      0.01       169
 LATINO VOICES       0.07      0.05      0.06       168
         MEDIA       0.16      0.

In [15]:
classifiers = {
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='auto', p=2),
    'DecisionTreeClassifier': DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=4),
    'LogisticRegression': LogisticRegression(),
    'LinearSVM': LinearSVC()
}

# Loop through classifiers and train each
for name, clf in classifiers.items():
    print(f"\nTraining with {name}")
    
    # Create pipeline for each classifier
    pipeline = Pipeline([
        ('TFIDFvectorizer', TfidfVectorizer(ngram_range=(1, 1))),
        ('classifier', clf)
    ])
    
    # Train the model
    pipeline.fit(x_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(x_test)
    
    # Print classification report
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred,zero_division=0))


Training with KNeighborsClassifier
Results for KNeighborsClassifier:
                precision    recall  f1-score   support

          ARTS       0.47      0.08      0.14       169
ARTS & CULTURE       0.07      0.08      0.07       169
  BLACK VOICES       0.12      0.11      0.11       169
      BUSINESS       0.21      0.15      0.17       169
       COLLEGE       0.23      0.18      0.20       168
        COMEDY       0.07      0.15      0.10       169
         CRIME       0.23      0.23      0.23       169
     EDUCATION       0.39      0.23      0.29       168
 ENTERTAINMENT       0.09      0.12      0.11       169
         FIFTY       0.36      0.07      0.12       169
     GOOD NEWS       0.11      0.15      0.13       169
         GREEN       0.30      0.17      0.22       169
HEALTHY LIVING       0.14      0.15      0.14       168
        IMPACT       0.08      0.04      0.05       169
 LATINO VOICES       0.16      0.14      0.15       168
         MEDIA       0.22      0.

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report





# Logistic Regression pipeline with scaling
pipeline_logistic = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer()),
    #('scaler', StandardScaler(with_mean=False)),  # Scaling for text data
    ('classifier', LogisticRegression())
])
# Define parameter grids with higher max_iter
param_grid_logistic = {
    'TFIDFvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],  # Unigram, bigram, trigram
    'classifier__C': [0.01, 0.1, 1, 10],  # Regularization strength
    'classifier__solver': ['liblinear', 'saga']  # Different solvers
    #'classifier__max_iter': [1000, 2000, 5000]  # Increase max_iter to allow more iterations
}
# GridSearchCV for Logistic Regression
grid_search_logistic = GridSearchCV(pipeline_logistic, param_grid_logistic, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_logistic.fit(x_train, y_train)
# LinearSVC pipeline with scaling
# Get the best parameters and print classification reports
print("\nBest parameters for Logistic Regression:")
print(grid_search_logistic.best_params_)
y_pred_logistic = grid_search_logistic.predict(x_test)
print(classification_report(y_test, y_pred_logistic, zero_division=0))



Best parameters for Logistic Regression:
{'TFIDFvectorizer__ngram_range': (1, 2), 'classifier__C': 10, 'classifier__solver': 'liblinear'}
                precision    recall  f1-score   support

          ARTS       0.44      0.51      0.48       169
ARTS & CULTURE       0.29      0.18      0.22       169
  BLACK VOICES       0.33      0.22      0.26       169
      BUSINESS       0.35      0.42      0.38       169
       COLLEGE       0.47      0.48      0.47       168
        COMEDY       0.21      0.16      0.18       169
         CRIME       0.36      0.53      0.43       169
     EDUCATION       0.55      0.61      0.58       168
 ENTERTAINMENT       0.16      0.12      0.14       169
         FIFTY       0.29      0.49      0.37       169
     GOOD NEWS       0.18      0.16      0.17       169
         GREEN       0.36      0.38      0.37       169
HEALTHY LIVING       0.23      0.26      0.24       168
        IMPACT       0.26      0.25      0.26       169
 LATINO VOICES      

In [30]:
# Update the pipeline with the best parameters
best_pipeline_lr = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer(ngram_range=(1, 3))),
    ('classifier', LogisticRegression(C=10, solver='liblinear'))
])

# Fit the best pipeline on the training data
best_pipeline_lr.fit(x_train, y_train)

# Predict on the test set
y_pred_best_lr = best_pipeline_lr.predict(x_test)

# Print the classification report for the best Logistic Regression model
print(classification_report(y_test, y_pred_best_lr, zero_division=0))

                precision    recall  f1-score   support

          ARTS       0.42      0.53      0.47       169
ARTS & CULTURE       0.29      0.15      0.20       169
  BLACK VOICES       0.36      0.23      0.28       169
      BUSINESS       0.37      0.42      0.39       169
       COLLEGE       0.46      0.45      0.45       168
        COMEDY       0.20      0.15      0.17       169
         CRIME       0.37      0.53      0.44       169
     EDUCATION       0.54      0.62      0.57       168
 ENTERTAINMENT       0.17      0.12      0.14       169
         FIFTY       0.24      0.51      0.33       169
     GOOD NEWS       0.22      0.20      0.21       169
         GREEN       0.38      0.37      0.38       169
HEALTHY LIVING       0.23      0.24      0.24       168
        IMPACT       0.26      0.24      0.25       169
 LATINO VOICES       0.43      0.26      0.32       168
         MEDIA       0.43      0.38      0.40       169
       PARENTS       0.27      0.34      0.30  

In [29]:
# Update the pipeline with the best parameters
best_pipeline_lr = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer(ngram_range=(1, 1))),
    ('classifier', LogisticRegression(C=10, solver='liblinear'))
])

# Fit the best pipeline on the training data
best_pipeline_lr.fit(x_train, y_train)

# Predict on the test set
y_pred_best_lr = best_pipeline_lr.predict(x_test)

# Print the classification report for the best Logistic Regression model
print(classification_report(y_test, y_pred_best_lr, zero_division=0))

                precision    recall  f1-score   support

          ARTS       0.47      0.42      0.44       169
ARTS & CULTURE       0.25      0.22      0.24       169
  BLACK VOICES       0.29      0.22      0.25       169
      BUSINESS       0.33      0.34      0.33       169
       COLLEGE       0.41      0.42      0.41       168
        COMEDY       0.15      0.15      0.15       169
         CRIME       0.33      0.42      0.37       169
     EDUCATION       0.59      0.54      0.57       168
 ENTERTAINMENT       0.15      0.14      0.14       169
         FIFTY       0.41      0.34      0.37       169
     GOOD NEWS       0.15      0.18      0.16       169
         GREEN       0.35      0.37      0.36       169
HEALTHY LIVING       0.20      0.22      0.21       168
        IMPACT       0.25      0.23      0.24       169
 LATINO VOICES       0.29      0.26      0.27       168
         MEDIA       0.39      0.36      0.38       169
       PARENTS       0.27      0.30      0.29  

In [23]:
param_grid_svc = {
    'TFIDFvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'classifier__C': [0.01, 0.1, 1, 10]
    #'classifier__max_iter': [100, 200, 300]
}
pipeline_svc = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer()),
    #('scaler', StandardScaler(with_mean=False)),
    ('classifier', LinearSVC())
])



# GridSearchCV for LinearSVC
grid_search_svc = GridSearchCV(pipeline_svc, param_grid_svc, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_svc.fit(x_train, y_train)



print("\nBest parameters for Linear SVC:")
print(grid_search_svc.best_params_)
y_pred_svc = grid_search_svc.predict(x_test)
print(classification_report(y_test, y_pred_svc, zero_division=0))



Best parameters for Linear SVC:
{'TFIDFvectorizer__ngram_range': (1, 1), 'classifier__C': 0.1}
                precision    recall  f1-score   support

          ARTS       0.42      0.49      0.45       169
ARTS & CULTURE       0.28      0.18      0.22       169
  BLACK VOICES       0.40      0.25      0.31       169
      BUSINESS       0.38      0.40      0.39       169
       COLLEGE       0.43      0.43      0.43       168
        COMEDY       0.21      0.12      0.16       169
         CRIME       0.33      0.52      0.40       169
     EDUCATION       0.54      0.64      0.58       168
 ENTERTAINMENT       0.16      0.12      0.14       169
         FIFTY       0.32      0.40      0.35       169
     GOOD NEWS       0.22      0.21      0.21       169
         GREEN       0.37      0.46      0.41       169
HEALTHY LIVING       0.24      0.25      0.24       168
        IMPACT       0.32      0.22      0.26       169
 LATINO VOICES       0.45      0.30      0.36       168
       

In [27]:
# Update the pipeline with the best parameters
best_pipeline_svc = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', LinearSVC(C=0.1))
])

# Fit the best pipeline on the training data
best_pipeline_svc.fit(x_train, y_train)

# Predict on the test set
y_pred_best_svc = best_pipeline_svc.predict(x_test)

# Print the classification report for the best model
print(classification_report(y_test, y_pred_best_svc, zero_division=0))


                precision    recall  f1-score   support

          ARTS       0.40      0.54      0.46       169
ARTS & CULTURE       0.25      0.12      0.16       169
  BLACK VOICES       0.40      0.22      0.28       169
      BUSINESS       0.36      0.40      0.38       169
       COLLEGE       0.41      0.42      0.42       168
        COMEDY       0.23      0.12      0.16       169
         CRIME       0.33      0.54      0.41       169
     EDUCATION       0.50      0.62      0.56       168
 ENTERTAINMENT       0.17      0.11      0.14       169
         FIFTY       0.24      0.46      0.32       169
     GOOD NEWS       0.22      0.17      0.19       169
         GREEN       0.37      0.43      0.40       169
HEALTHY LIVING       0.26      0.25      0.26       168
        IMPACT       0.27      0.20      0.23       169
 LATINO VOICES       0.49      0.29      0.36       168
         MEDIA       0.41      0.43      0.42       169
       PARENTS       0.28      0.36      0.31  

In [28]:
# Update the pipeline with the best parameters
best_pipeline_svc = Pipeline([
    ('TFIDFvectorizer', TfidfVectorizer(ngram_range=(1, 3))),
    ('classifier', LinearSVC(C=0.1))
])

# Fit the best pipeline on the training data
best_pipeline_svc.fit(x_train, y_train)

# Predict on the test set
y_pred_best_svc = best_pipeline_svc.predict(x_test)

# Print the classification report for the best model
print(classification_report(y_test, y_pred_best_svc, zero_division=0))


                precision    recall  f1-score   support

          ARTS       0.39      0.52      0.45       169
ARTS & CULTURE       0.24      0.12      0.16       169
  BLACK VOICES       0.42      0.21      0.28       169
      BUSINESS       0.36      0.39      0.37       169
       COLLEGE       0.40      0.43      0.41       168
        COMEDY       0.22      0.12      0.16       169
         CRIME       0.32      0.56      0.41       169
     EDUCATION       0.48      0.62      0.54       168
 ENTERTAINMENT       0.17      0.11      0.14       169
         FIFTY       0.22      0.48      0.30       169
     GOOD NEWS       0.22      0.18      0.20       169
         GREEN       0.37      0.40      0.39       169
HEALTHY LIVING       0.26      0.24      0.25       168
        IMPACT       0.25      0.18      0.21       169
 LATINO VOICES       0.47      0.25      0.33       168
         MEDIA       0.40      0.40      0.40       169
       PARENTS       0.27      0.36      0.31  

In [32]:
from gensim.models import Word2Vec
import numpy as np

# Assume `x_train` and `x_test` are lists of tokenized sentences (e.g. after splitting by whitespace)

# Train Skip-gram model
skipgram_model = Word2Vec(x_train, vector_size=100, window=5, min_count=1, sg=1)

# Function to convert a sentence into average word vectors
def sentence_to_avg_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# Convert train and test sentences to vectors
x_train_vec = np.array([sentence_to_avg_vector(sentence, skipgram_model) for sentence in x_train])
x_test_vec = np.array([sentence_to_avg_vector(sentence, skipgram_model) for sentence in x_test])

# Now you can use x_train_vec and x_test_vec with any of the classifiers, for example:
clf = LogisticRegression()
clf.fit(x_train_vec, y_train)
y_pred = clf.predict(x_test_vec)

# Print classification report
print(classification_report(y_test, y_pred))


                precision    recall  f1-score   support

          ARTS       0.00      0.00      0.00       169
ARTS & CULTURE       0.04      0.01      0.02       169
  BLACK VOICES       0.17      0.01      0.01       169
      BUSINESS       0.00      0.00      0.00       169
       COLLEGE       0.07      0.06      0.07       168
        COMEDY       0.07      0.11      0.08       169
         CRIME       0.10      0.01      0.02       169
     EDUCATION       0.04      0.01      0.01       168
 ENTERTAINMENT       0.13      0.01      0.02       169
         FIFTY       0.10      0.22      0.14       169
     GOOD NEWS       0.08      0.09      0.08       169
         GREEN       0.00      0.00      0.00       169
HEALTHY LIVING       0.04      0.15      0.06       168
        IMPACT       0.00      0.00      0.00       169
 LATINO VOICES       0.07      0.02      0.03       168
         MEDIA       0.06      0.05      0.05       169
       PARENTS       0.08      0.02      0.03  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
pip install gensim

Collecting gensim
  Using cached gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   -- ------------------------------------- 1.6/24.0 MB 4.6 MB/s eta 0:00:05
   ---- ----------------------------------- 2.9/24.0 MB 4.8 MB/s eta 0:00:05
   ------ --------------------------------- 4.2/24.0 MB 4.8 MB/s eta 0:00:05
   ---------- ----------------------------- 6.0/24.0 MB 4.9 MB/s eta 0:00:04
   ------------ --------------------------- 7.6/24.0 MB 5.0 MB/s eta 0:00:04
   -------------- ------------------------- 8.7/24.0 MB 5.0 MB/s eta 0:00:04
   ----------------- ---------------------- 10.2/24.0 MB 5.1 MB/s eta 0:00:03
   ------------------ --------------------- 11.3/24.0 MB 5.2 MB/s eta 0:00:03
   -------------------- ------------------- 

  You can safely remove it manually.
  You can safely remove it manually.
