In [1]:
import warnings
import pandas as pd
import numpy as no
# Filter out DeprecationWarnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# below is loading dataset. put own path if csv file is not in same folder

f="8606 db for prelabelling  - db.csv"


df1=pd.read_csv(f,skiprows=11)


In [2]:
df=df1[["Question","Answer","Final"]] # consider question, answer and final label columns only 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8479 entries, 0 to 8478
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  8479 non-null   object
 1   Answer    8477 non-null   object
 2   Final     8479 non-null   object
dtypes: object(3)
memory usage: 198.9+ KB


In [3]:
df["Final"].value_counts() # Will remove Tie and add the labels into one final label

Final
analysis                 2911
science and tech         2025
Tie                      1848
factual                   726
strategy                  725
management                 84
taxonomy                   80
ethics and regulation      52
Science and Tech            5
incomplete Q&A              5
analysis                    3
Analysis                    3
Taxonomy                    3
science and tech            2
Management                  2
Factual                     2
taxonomy                    1
factual                     1
incomplete                  1
Name: count, dtype: int64

In [4]:
# Remove rows where 'text1' column contains the specified value
df = df[df['Final'] != 'Tie']

# Print the resulting DataFrame

In [5]:
df2=df.dropna() # Drop Nans rows

In [6]:
df2['joined_column'] = df2['Question'] + ' ' + df2['Answer'] # combine question and answer into one

In [7]:
df3=df2[["joined_column","Final"]]

In [8]:
df3["Final"].value_counts() # final count of labels

Final
analysis                 2911
science and tech         2025
strategy                  725
factual                   724
management                 84
taxonomy                   80
ethics and regulation      52
Science and Tech            5
incomplete Q&A              5
Taxonomy                    3
analysis                    3
Analysis                    3
science and tech            2
Management                  2
Factual                     2
factual                     1
incomplete                  1
taxonomy                    1
Name: count, dtype: int64

In [9]:
df4=df3[(df3["Final"] == 'strategy') | (df3["Final"] == 'science and tech') | (df3["Final"] == 'analysis')
       | (df3["Final"] == 'factual') | (df3["Final"] == 'taxonomy') | (df3["Final"] == 'management')
       | (df3["Final"] == 'ethics and regulation')] # consider only these labels , can correct other spelling mistakes 
                                                    # into parent one 

In [10]:
# create a list text

text = list(df4['joined_column'])




# preprocessing loop

import re

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()




corpus = []


#Remove all special characters
#Lowercase all the words
#Tokenize
#Remove stopwords
#Lemmatize

for i in range(len(text)):

    r = re.sub('[^a-zA-Z]', ' ', text[i])

    r = r.lower()

    r = r.split()

    r = [word for word in r if word not in stopwords.words('english')]

    r = [lemmatizer.lemmatize(word) for word in r]

    r = ' '.join(r)

    corpus.append(r)




#assign corpus to data['text']

df4['text'] = corpus

df4.head()

Unnamed: 0,joined_column,Final,text
0,What were the sources of atmospheric nutrients...,analysis,source atmospheric nutrient tianchi lake prima...
1,How was the fertilization effect on phytoplank...,analysis,fertilization effect phytoplankton expected ch...
2,How do rising temperatures affect Alpine lakes...,science and tech,rising temperature affect alpine lake rising t...
3,How has an increase in phytoplankton biomass b...,analysis,increase phytoplankton biomass observed alpine...
4,How do higher metabolic rates of organisms and...,science and tech,higher metabolic rate organism longer growing ...


In [11]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Sample data for class labels
y = ['analysis', 'ethics and regulation', 'factual', 'management', 'science and tech', 'strategy', 'taxonomy']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on your class labels and transform them into integer labels
y_encoded = label_encoder.fit_transform(df4["Final"])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df4["text"], y_encoded, test_size=0.2, random_state=42)
# split train and test.

In [13]:
# Convert text data to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [15]:
# Apply oversampling to balance the training data
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)



In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report,make_scorer
from sklearn.preprocessing import LabelEncoder


# setting class_weight balenced helps with data imbalenced issues 

random_forest = RandomForestClassifier(class_weight='balanced')

logistic_regression1 = LogisticRegression(class_weight='balanced',max_iter=7600)
xgboost_classifier1 = XGBClassifier(n_estimators=300)

# Create a voting classifier with 'soft' voting
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', logistic_regression1),
        ('xg', xgboost_classifier1),
        ('dt', random_forest),
       
      
        
    ],
    voting='soft'  # Use 'soft' voting for probability-based decision
)

# Train the voting classifier
voting_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred1 = voting_classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy1 = accuracy_score(y_test, y_pred1)
print("Voting Classifier Accuracy:", accuracy1)

# Evaluate the models
print("Logistic Regression:")
print(classification_report(y_test, y_pred1))



Voting Classifier Accuracy: 0.74110522331567
Logistic Regression:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       608
           1       0.50      0.29      0.36         7
           2       0.80      0.70      0.75       152
           3       0.50      0.19      0.27        16
           4       0.71      0.74      0.72       381
           5       0.62      0.62      0.62       141
           6       0.83      0.31      0.45        16

    accuracy                           0.74      1321
   macro avg       0.68      0.52      0.57      1321
weighted avg       0.74      0.74      0.74      1321



- RandomOverSampler and weight="balenced" helps improve the data imbalenced issue metrics by a bit. This shows these methods should be considered and used to counter the data imbalence issue in the dataset.
- Also there are other data imbalenced methods that can be explored aswell

- the less imbalenced dataset issues metrics seem better than previous attempt overall when randomoversampler was not used as it takes into account data imbalence when training the models 