In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier


In [3]:
# Data import
data = pd.read_csv('Data\dataset_for_usage.csv')

In [4]:
unique_labels = data['Source'].unique()
print("Unique Labels:", unique_labels)
for label in unique_labels:
    na_count = data[data['Source'] == label].isna().sum()
    print(f"Missing values for {label}:", na_count)

Unique Labels: ['abstract' 'article' 'blog' 'movie' 'reddit' 'twitter' 'legal']
Missing values for abstract: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for article: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for blog: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for movie: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for reddit: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for twitter: Unnamed: 0    0
text          0
Source        0
dtype: int64
Missing values for legal: Unnamed: 0    0
text          0
Source        0
dtype: int64


## Processing

## Model training and evaluation

In [5]:

def preprocess_data(data):
    # Handle missing values in text data
    data['text'].fillna("", inplace=True)

    # Perform stemming using PorterStemmer
    stemmer = PorterStemmer()
    data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

    text_data = data['text'].tolist()
    target = data['Source']

    return text_data, target

# Load and preprocess the data
text_data, target = preprocess_data(data)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, target, test_size=0.2, random_state=42)


# Modelling

In [6]:
# Define the models
models = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Logistic Regression', LogisticRegression())
]

### Defining pipeline

In [7]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', models)
])
pipeline

### Train the models

In [9]:

# Train the individual models
for name, model in models:
    print("Training:", name)
    pipeline.set_params(model=model)
    pipeline.fit(X_train, y_train)
    accuracy = pipeline.score(X_test, y_test)
    print("Accuracy:", accuracy)

Training: Multinomial Naive Bayes
Accuracy: 0.7150192554557124
Training: SVM
Accuracy: 0.9486521181001284
Training: Random Forest
Accuracy: 0.9328198545143346
Training: Gradient Boosting
Accuracy: 0.9512195121951219
Training: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9469405220367993


### Trying out an ensemble model

In [10]:
# Create the list of estimators for the ensemble model
estimators = models
# Define the ensemble model
ensemble_model = VotingClassifier(estimators)
# Define the pipeline
pipeline_02 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', ensemble_model)
])
pipeline_02


In [11]:
# Train the ensemble model
pipeline_02.fit(X_train, y_train)
# Evaluate the ensemble model
accuracy_02 = pipeline_02.score(X_test, y_test)
print("Accuracy:", accuracy_02)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9544287548138639


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report, roc_curve, roc_auc_score, \
    precision_recall_curve, average_precision_score, cohen_kappa_score

# Evaluate the ensemble model
y_pred_02 = pipeline_02.predict(X_test)
# Accuracy
accuracy_ensemble = accuracy_score(y_test, y_pred_02)
# Precision
precision_ensemble = precision_score(y_test, y_pred_02, average='macro')
# Recall
recall_ensemble = recall_score(y_test, y_pred_02, average='macro')
# F1-Score
f1_ensemble = f1_score(y_test, y_pred_02, average='macro')
# Confusion Matrix
confusion_mat_ensemble = confusion_matrix(y_test, y_pred_02)
# Classification Report
classification_rep_ensemble = classification_report(y_test, y_pred_02)




# Cohen's Kappa
cohen_kappa = cohen_kappa_score(y_test, y_pred_02)

print("Accuracy:", accuracy_ensemble)
print("Precision:", precision_ensemble)
print("Recall:", recall_ensemble)
print("F1-Score:", f1_ensemble)
print("Confusion Matrix:\n", confusion_mat_ensemble)
print("Classification Report:\n", classification_rep_ensemble)
print("Cohen's Kappa:", cohen_kappa)


Accuracy: 0.9544287548138639
Precision: 0.9553332179236041
Recall: 0.954656299766363
F1-Score: 0.9548521518214537
Confusion Matrix:
 [[643   0   0   0   0   0   0]
 [  0 681  26   0   0   0   0]
 [  0  34 617   0   0  33   0]
 [  0   0   0 654   0   0   0]
 [  0   0   1   0 667   1   0]
 [  0   0   5   0   0 639  55]
 [  0   0   2   0   0  56 560]]
Classification Report:
               precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       643
     article       0.95      0.96      0.96       707
        blog       0.95      0.90      0.92       684
       legal       1.00      1.00      1.00       654
       movie       1.00      1.00      1.00       669
      reddit       0.88      0.91      0.89       699
     twitter       0.91      0.91      0.91       618

    accuracy                           0.95      4674
   macro avg       0.96      0.95      0.95      4674
weighted avg       0.95      0.95      0.95      4674

Cohen's Kappa: 0.94681388819

In [13]:
new_text = """The development of light detection and ranging, Radar, camera, and other advanced sensor technologies inaugurated a new era in autonomous driving. However, due to the intrinsic limitations of these sensors, autonomous vehicles are prone to making erroneous decisions and causing serious disasters. At this point, networking and communication technologies can greatly make up for sensor deficiencies, and are more reliable, feasible and efficient to promote the information interaction, thereby improving autonomous vehicle's perception and planning capabilities as well as realizing better vehicle control. This paper surveys the networking and communication technologies in autonomous driving from two aspects: intra- and inter-vehicle. The intra-vehicle network as the basis of realizing autonomous driving connects the on-board electronic parts. The inter-vehicle network is the medium for interaction between vehicles and outside information. In addition, we present the new trends of communication technologies in autonomous driving, as well as investigate the current mainstream verification methods and emphasize the challenges and open issues of networking and communications in autonomous driving."""

predicted_label = pipeline_02.predict([new_text])

print("Predicted label:", predicted_label)



Predicted label: ['abstract']
