In [None]:
# Essential imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Manually annotated sample dataset for classification.
# Read only the columns listed below from the labelled CSV.
file_path = 'label_data_1.csv'
data = pd.read_csv(
    file_path,
    sep=',',
    encoding='latin1',
    usecols=[
        'condition',
        'Text',
        'Tag',
        'Word',
        'sentence_number',
    ],
)


In [None]:
# Preprocess the data: one cleaned row per unique (Text, condition) pair.
label_data = pd.read_csv(file_path, encoding='ISO-8859-1', delimiter=',')
unique_label_data = (
    label_data[['Text', 'condition']]
    .drop_duplicates()
    # Unlabelled rows cannot be used for training, and a missing Text value
    # cannot be cleaned or vectorised, so both are dropped.
    .dropna(subset=['Text', 'condition'])
    .astype({'condition': int})
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave punctuation untouched.
    .assign(
        Text=lambda frame: frame['Text']
        .str.replace('[^a-zA-Z0-9 ]', ' ', regex=True)
        .str.lower()
    )
)

In [None]:
# Display data information.
# sep='\n' puts each multi-line table on its own line; printed right after the
# label, the first (header) row is shifted and no longer lines up with the rows.
print('Data Head:', data.head(), sep='\n')
print('Data datatype:', data.dtypes, sep='\n')
print('Data Shape:', data.shape)
# Number of unique labelled texts per condition value.
print('Condition Counts:', unique_label_data['condition'].value_counts(), sep='\n')

In [None]:
# Build the model inputs: `condition` is the target, and the text column is
# turned into a TF-IDF matrix by a vectorizer fitted on that same text.
y = unique_label_data['condition']
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(unique_label_data['Text'])


In [None]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)


## Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split, then report
# per-class metrics on the held-out split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_mnb))

## Logistic regression

In [None]:
# Fit an L2-penalised logistic regression (up to 2000 solver iterations) on the
# training split, then report per-class metrics on the held-out split.
lr = LogisticRegression(penalty='l2', max_iter=2000).fit(X_train, y_train)
predictions_lr = lr.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, predictions_lr))

## SVM for classification

In [None]:
# SVM model: linear-kernel support vector classifier, scored on the held-out split.
# gamma is a coefficient of the rbf/poly/sigmoid kernels only and has no effect
# with kernel='linear', so it is not passed; C is the only setting in play here.
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

In [None]:
# Apply the SVM model to new data and keep the rows predicted as condition 1
data_new_path = 'Data.csv'  # Update the path if needed

data_new = pd.read_csv(data_new_path, encoding='utf-8', delimiter=',')
# Vectorise a normalised copy of the text (same character filter and
# lowercasing as the preprocessing step for the training text) so new documents
# are tokenised against the fitted vocabulary in the same way. The 'Text'
# column itself is left untouched, so the saved output keeps the original wording.
text_for_model = (
    data_new['Text']
    .str.replace('[^a-zA-Z0-9 ]', ' ', regex=True)
    .str.lower()
)
X_new = tfidf_vectorizer.transform(text_for_model)
predictions_new = svm_model.predict(X_new)
data_new['Predicted_Condition'] = predictions_new
traffic_related_data_new = data_new[data_new['Predicted_Condition'] == 1]


In [None]:
# Write the filtered predictions to a CSV file (row index omitted) and report
# where the file was written.
output_new_data_file_path = 'Classified_data.csv'  # Update the path if needed
traffic_related_data_new.to_csv(path_or_buf=output_new_data_file_path, index=False)

print("Output CSV file created: " + output_new_data_file_path)

In [None]:
# Reload the saved results as a sanity check and preview the first 10 rows.
# The path comes from output_new_data_file_path rather than a repeated literal,
# so this check keeps reading the file that was actually written if that path
# is changed.
new = pd.read_csv(output_new_data_file_path)
new.head(10)

In [None]:
# (rows, columns) of the frame reloaded from the classified-data CSV.
new.shape