In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D
from gensim.models import KeyedVectors
from keras.optimizers import Adam
from keras.optimizers import Adam
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score


In [17]:
# Data import
data = pd.read_csv('Data/balenced_data.csv')

In [19]:
unique_labels = data['Source'].unique()
print("Unique Labels:", unique_labels)
for label in unique_labels:
    na_count = data[data['Source'] == label].isna().sum()
    print(f"Missing values for {label}:", na_count)

Unique Labels: ['abstract' 'article' 'blog' 'movie' 'reddit' 'song' 'twitter']
Missing values for abstract: text      0
Source    0
dtype: int64
Missing values for article: text      0
Source    0
dtype: int64
Missing values for blog: text      0
Source    0
dtype: int64
Missing values for movie: text      585
Source      0
dtype: int64
Missing values for reddit: text      11
Source     0
dtype: int64
Missing values for song: text      0
Source    0
dtype: int64
Missing values for twitter: text      0
Source    0
dtype: int64


## Processing

In [103]:
def preprocess_data(data):
    # Handle missing values in text data
    data['text'].fillna("", inplace=True)

    text_data = data['text'].tolist()
    target = data['Source']

    # Stemming
    stemmer = PorterStemmer()
    stemmed_text = [' '.join([stemmer.stem(word) for word in text.split()]) for text in text_data]

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(stemmed_text)

    # Step 3: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test, vectorizer



## Model training and evaluation

In [104]:
X_train, X_test, y_train, y_test, vectorizer = preprocess_data(data)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(21968, 225833) (21968,)
(5493, 225833) (5493,)


### MultinominalNB als Model

In [92]:
# Create a Multinomial Naive Bayes classifier
classifiermultib = MultinomialNB()
# Train the classifier
classifiermultib.fit(X_train, y_train)
# Example usage
# Make predictions on the test data
y_pred_multib = classifiermultib.predict(X_test)
# Evaluate the classifier
precision_multib = precision_score(y_test, y_pred_multib, average='weighted')
report_multib = classification_report(y_test, y_pred_multib)
print("Precision:", precision_multib)
print("Classification Report:")
print(report_multib)


Precision: 0.7302249715975466
Classification Report:
              precision    recall  f1-score   support

    abstract       0.99      1.00      0.99       777
     article       0.63      0.93      0.75       790
        blog       0.86      0.22      0.35       811
       movie       0.33      0.85      0.48       805
      reddit       0.52      0.23      0.32       759
        song       0.91      0.33      0.48       769
     twitter       0.88      0.73      0.80       782

    accuracy                           0.61      5493
   macro avg       0.73      0.61      0.60      5493
weighted avg       0.73      0.61      0.60      5493



### Support Vector Machines (SVM)

In [93]:
# Create an SVM classifier
classifier_SVM = SVC()
# Train the classifier
classifier_SVM.fit(X_train, y_train)
# Make predictions on the test data
y_pred_SVM = classifier_SVM.predict(X_test)
# Evaluate the classifier
precision_SVM = precision_score(y_test, y_pred_SVM, average='weighted')
report_SVM = classification_report(y_test, y_pred_SVM)
print("Precision:", precision_SVM)
print("Classification Report:")
print(report_SVM)



Precision: 0.9306422565987887
Classification Report:
              precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       777
     article       0.95      0.95      0.95       790
        blog       0.91      0.84      0.87       811
       movie       0.98      0.99      0.99       805
      reddit       0.78      0.92      0.85       759
        song       0.95      0.89      0.92       769
     twitter       0.94      0.89      0.91       782

    accuracy                           0.93      5493
   macro avg       0.93      0.93      0.93      5493
weighted avg       0.93      0.93      0.93      5493



### Random Forest

In [99]:
# Create a Random Forest classifier
classifierRF = RandomForestClassifier()
# Train the classifier
classifierRF.fit(X_train, y_train)
# Make predictions on the test data
y_pred_RF = classifierRF.predict(X_test)
# Evaluate the classifier
precision_RF = precision_score(y_test, y_pred_RF, average='weighted')
report_RF = classification_report(y_test, y_pred_RF)
print("Precision:", precision_RF)
print("Classification Report:")
print(report_RF)


Precision: 0.905625941361713
Classification Report:
              precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       777
     article       0.91      0.95      0.93       790
        blog       0.85      0.83      0.84       811
       movie       0.87      1.00      0.93       805
      reddit       0.85      0.78      0.81       759
        song       0.95      0.85      0.90       769
     twitter       0.91      0.92      0.92       782

    accuracy                           0.90      5493
   macro avg       0.91      0.90      0.90      5493
weighted avg       0.91      0.90      0.90      5493



### Gradient Boosting

In [None]:
# Create a Gradient Boosting classifier
classifierGB = GradientBoostingClassifier()
# Train the classifier
classifierGB.fit(X_train, y_train)
# Make predictions on the test data
y_pred_GB = classifierGB.predict(X_test)
# Evaluate the classifier
precision_GB = precision_score(y_test, y_pred_GB, average='weighted')
report_GB = classification_report(y_test, y_pred_GB)
print("Precision:", precision_GB)
print("Classification Report:")
print(report_GB)


### Logistic Regression

In [74]:
# Step 4: Model selection and training
classifierLR = LogisticRegression()
classifierLR.fit(X_train, y_train)

# Step 5: Model evaluation
y_pred_LR = classifierLR.predict(X_test)
precision_LR = precision_score(y_test, y_pred_LR, average='weighted')
report_LR = classification_report(y_test, y_pred_LR)
print("Precision:", precision_LR)
print("Classification Report:")
print(report_LR)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision: 0.9154214893941176
Classification Report:
              precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       777
     article       0.94      0.96      0.95       790
        blog       0.91      0.85      0.88       811
       movie       0.96      0.84      0.90       805
      reddit       0.70      0.93      0.80       759
        song       0.95      0.87      0.91       769
     twitter       0.94      0.88      0.91       782

    accuracy                           0.90      5493
   macro avg       0.91      0.91      0.91      5493
weighted avg       0.92      0.90      0.91      5493



## Ensambling method

In [None]:
# Create individual models
model1 = MultinomialNB()
model2 = RandomForestClassifier()
model3 = GradientBoostingClassifier()
model4 = SVC(probability=True)  # SVM model


# Create a voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('nb', model1),
    ('rf', model2),
    ('gb', model3),
    ('svm', model4),
    ('nn', model5)
], voting='soft')

# Train the voting classifier
voting_classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = voting_classifier.predict(X_test)

# Evaluate the voting classifier
report = classification_report(y_test, y_pred)
print(report)


### Test 3.0 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from nltk.stem import PorterStemmer

def preprocess_data(data):
    # Handle missing values in text data
    data['text'].fillna("", inplace=True)

    # Perform stemming using PorterStemmer
    stemmer = PorterStemmer()
    data['text'] = data['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

    text_data = data['text'].tolist()
    target = data['Source']

    return text_data, target

# Load and preprocess the data
text_data, target = preprocess_data(data)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_data, target, test_size=0.2, random_state=42)


In [13]:
# Define the models
models = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('Logistic Regression', LogisticRegression())
]

In [15]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('model', models)
])
pipeline

In [None]:

# Train the individual models
for name, model in models:
    print("Training:", name)
    pipeline.set_params(model=model)
    pipeline.fit(X_train, y_train)
    accuracy = pipeline.score(X_test, y_test)
    print("Accuracy:", accuracy)