In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None  # default='warn'


f="8606 db for prelabelling  - db.csv"


df1=pd.read_csv(f,skiprows=11)

#Read the dataset, skip first 11 rows as was empty space



In [2]:
df=df1[["Question","Answer","Final"]]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8479 entries, 0 to 8478
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  8479 non-null   object
 1   Answer    8477 non-null   object
 2   Final     8479 non-null   object
dtypes: object(3)
memory usage: 198.9+ KB


In [3]:
# removing tie labels as it is unknown, and will affect model performance for now
df = df[df['Final'] != 'Tie']

# Print the resulting DataFrame

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6631 entries, 0 to 8477
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  6631 non-null   object
 1   Answer    6629 non-null   object
 2   Final     6631 non-null   object
dtypes: object(3)
memory usage: 207.2+ KB


In [5]:
df2=df.dropna() # Drop Nans rows

In [6]:
#In this appraoch, will combine question and answer into one column. 
#will experiment with both the columns as seperate features and so on in future iterations

df2['joined_column'] = df['Question'] + ' ' + df['Answer'] 

In [7]:
df3=df2[["joined_column","Final"]]

In [8]:
df3["Final"].value_counts()

Final
analysis                 2911
science and tech         2025
strategy                  725
factual                   724
management                 84
taxonomy                   80
ethics and regulation      52
Science and Tech            5
incomplete Q&A              5
Taxonomy                    3
analysis                    3
Analysis                    3
science and tech            2
Management                  2
Factual                     2
factual                     1
incomplete                  1
taxonomy                    1
Name: count, dtype: int64

In [9]:
df4=df3[(df3["Final"] == 'strategy') | (df3["Final"] == 'science and tech') | (df3["Final"] == 'analysis')
       | (df3["Final"] == 'factual') | (df3["Final"] == 'taxonomy') | (df3["Final"] == 'management')
       | (df3["Final"] == 'ethics and regulation')] # only consider the main labels, can combine the mispelled into parent 
                                                    # spelling label. ALSO, data is unbalenced for ethics,taxonomy and managment mainly

In [10]:
len(df4[df4["Final"]=="taxonomy"])

80

In [11]:
import re

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer() # lemma




corpus = []


text = list(df4['joined_column']) # for cleaning purposes

#below is following:
#Remove all special characters
#Lowercase all the words
#Tokenize
#Remove stopwords
#Lemmatize

for i in range(len(text)):

    r = re.sub('[^a-zA-Z]', ' ', text[i]) 

    r = r.lower()

    r = r.split()

    r = [word for word in r if word not in stopwords.words('english')]

    r = [lemmatizer.lemmatize(word) for word in r]

    r = ' '.join(r)

    corpus.append(r)




#assign corpus to data['text']

df4['joined_column'] = corpus

df4.head()

Unnamed: 0,joined_column,Final
0,source atmospheric nutrient tianchi lake prima...,analysis
1,fertilization effect phytoplankton expected ch...,analysis
2,rising temperature affect alpine lake rising t...,science and tech
3,increase phytoplankton biomass observed alpine...,analysis
4,higher metabolic rate organism longer growing ...,science and tech


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report,make_scorer
from sklearn.preprocessing import LabelEncoder


# Preprocessing steps
X = df4['joined_column']
y = df4['Final']

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Define classifiers and their hyperparameter grids
classifiers = {
    'Logistic Regression': (LogisticRegression(max_iter=7600), {'C': [0.1, 1.0, 10.0]}),
    'K-Nearest Neighbors': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
    'Decision Tree': (DecisionTreeClassifier(), {'max_depth': [None, 10, 20]}),
    'Random Forest': (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
    'Support Vector Machine': (SVC(), {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']})
}

# Define evaluation metrics
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score, average='weighted'),
    'Recall': make_scorer(recall_score, average='weighted'),
    'F1': make_scorer(f1_score, average='weighted')
}

# Perform grid search for each classifier
results = {}

for classifier_name, (classifier, param_grid) in classifiers.items():
    grid_search = GridSearchCV(classifier, param_grid, scoring=scoring, refit='F1', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)

    # Store results
    results[classifier_name] = {
        'Best Parameters': grid_search.best_params_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'Classification Report': classification_report(y_test, y_pred, target_names=label_encoder.classes_, zero_division=0)
    }

# Display results
for classifier_name, metrics in results.items():
    print(f'Classifier: {classifier_name}')
    print(f'Best Parameters: {metrics["Best Parameters"]}')
    print(f'Accuracy: {metrics["Accuracy"]:.4f}')
    print(f'Precision: {metrics["Precision"]:.4f}')
    print(f'Recall: {metrics["Recall"]:.4f}')
    print(f'F1-score: {metrics["F1"]:.4f}')
    print(f'Classification Report:\n{metrics["Classification Report"]}')
    print('\n')
    print('\n')

Classifier: Logistic Regression
Best Parameters: {'C': 1.0}
Accuracy: 0.7320
Precision: 0.7256
Recall: 0.7320
F1-score: 0.7194
Classification Report:
                       precision    recall  f1-score   support

             analysis       0.74      0.86      0.79       608
ethics and regulation       0.00      0.00      0.00         7
              factual       0.86      0.63      0.73       152
           management       1.00      0.19      0.32        16
     science and tech       0.68      0.73      0.70       381
             strategy       0.71      0.50      0.59       141
             taxonomy       0.00      0.00      0.00        16

             accuracy                           0.73      1321
            macro avg       0.57      0.41      0.45      1321
         weighted avg       0.73      0.73      0.72      1321





Classifier: K-Nearest Neighbors
Best Parameters: {'n_neighbors': 7}
Accuracy: 0.6048
Precision: 0.6392
Recall: 0.6048
F1-score: 0.5614
Classification 


#This was the dataset labels size for this approach 

analysis                 2911

science and tech         2025

strategy                  725

factual                   724

management                 84

taxonomy                   80

ethics and regulation      52

Key points:

- Tie labels were not considered for this approach but can be intergrating in future appraoches with relavant solutions




- Taxonomy, Management and ethics and regulation had low precision and recall scores. This is due to them having less
  labeled data. 
  
  
- analysis, factual, science and tech are getting decent results but strategy could be better.


- Need to integrate data imbalenced approaches to the models


- Code will need to be revised aswell in future iterations


###  This is an initial baseline draft version, can build and improve this further by trying different pre-processing, cleaning, features, models and so on.