In [61]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D
from gensim.models import KeyedVectors
from keras.optimizers import Adam
from keras.optimizers import Adam
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score


In [62]:
# Data import
data = pd.read_csv('Data/balenced_data.csv')

## Tokenization and stemming

In [63]:
# Preprocess function to clean and preprocess text

def preprocess_text(text):
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenization
    tokens = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    return processed_text
# Example usage
# text = "This is an example sentence."
# processed_text = preprocess_text(text)
# print(processed_text)


## Vectorization

In [11]:
# The following code will cause memory error: 
# Unable to allocate 45.7 GiB for an array with shape (26865, 228492) and data type float64

# Drop rows with missing values in the 'text' column
# data = data.dropna(subset=['text'])
# Create a TF-IDF vectorizer
# vectorizer = TfidfVectorizer()
# Fit and transform the preprocessed text data
# features = vectorizer.fit_transform(data['text'])
# Convert features to a dense matrix
# features = features.todense()
# Example usage
# print(features.shape)


Due to the above mentioned memory problem, we manually limited the maximum features extracted. Limiting the number of features can potentially impact the model's performance, as it reduces the amount of information available for training.
By setting a maximum number of features, we are essentially reducing the dimensionality of the feature space. This can help mitigate memory constraints and improve computational efficiency. However, it also means that some potentially relevant features may be discarded, which can result in a loss of information. It's important to strike a balance between reducing dimensionality for efficiency purposes and retaining enough informative features for effective model training. The optimal number of features may vary depending on the specific dataset and problem domain, so it's worth experimenting with different feature subset sizes to find the most suitable configuration.

In [65]:
data_withoutna = data.dropna(subset=['text'])
# Shuffle the data to get a random subset
shuffled_data = shuffle(data_withoutna, random_state=42)
# Set the maximum number of features and the sample size
max_features = 100000
sample_size = 26865
# Create a TF-IDF vectorizer with limited features
vectorizer = TfidfVectorizer(max_features=max_features)
# Sample a smaller subset of the data for feature extraction
sampled_data = shuffled_data.sample(n=sample_size, random_state=42)
# Fit and transform the preprocessed text data
features = vectorizer.fit_transform(sampled_data['text'])
# Example usage
print(features.shape)


(26865, 100000)


## Model training and evaluation

In [68]:
# Split the features and labels into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, data_withoutna['Source'], test_size=0.2, random_state=42)
# Example usage
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(21492, 100000) (21492,)
(5373, 100000) (5373,)


### MultinominalNB als Model

In [75]:
# Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
# Train the classifier
classifier.fit(X_train, y_train)
# Example usage
# Make predictions on the test data
y_pred = classifier.predict(X_test)
# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    abstract       0.99      1.00      0.99       777
     article       0.66      0.92      0.77       790
        blog       0.86      0.22      0.35       811
       movie       0.32      0.85      0.46       805
      reddit       0.51      0.22      0.30       759
        song       0.91      0.32      0.47       769
     twitter       0.88      0.73      0.80       782

    accuracy                           0.61      5493
   macro avg       0.73      0.61      0.59      5493
weighted avg       0.73      0.61      0.59      5493



### Support Vector Machines (SVM)

In [36]:
# Create an SVM classifier
classifier = SVC()
# Train the classifier
classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = classifier.predict(X_test)
# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

    abstract       0.14      0.18      0.16       792
     article       0.15      0.14      0.15       801
        blog       0.14      0.18      0.16       775
       movie       0.15      0.01      0.02       671
      reddit       0.13      0.15      0.14       768
        song       0.16      0.18      0.17       786
     twitter       0.14      0.14      0.14       780

    accuracy                           0.14      5373
   macro avg       0.14      0.14      0.13      5373
weighted avg       0.14      0.14      0.14      5373



### Random Forest

In [39]:
# Create a Random Forest classifier
classifier = RandomForestClassifier()
# Train the classifier
classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = classifier.predict(X_test)
# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    abstract       0.14      0.16      0.15       792
     article       0.15      0.16      0.16       801
        blog       0.14      0.15      0.14       775
       movie       0.10      0.05      0.06       671
      reddit       0.14      0.14      0.14       768
        song       0.15      0.18      0.16       786
     twitter       0.16      0.16      0.16       780

    accuracy                           0.14      5373
   macro avg       0.14      0.14      0.14      5373
weighted avg       0.14      0.14      0.14      5373



### Gradient Boosting

In [40]:

# Create a Gradient Boosting classifier
classifier = GradientBoostingClassifier()
# Train the classifier
classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = classifier.predict(X_test)
# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    abstract       0.15      0.13      0.14       792
     article       0.14      0.09      0.11       801
        blog       0.14      0.07      0.09       775
       movie       0.14      0.04      0.07       671
      reddit       0.14      0.50      0.22       768
        song       0.16      0.10      0.12       786
     twitter       0.17      0.08      0.11       780

    accuracy                           0.14      5373
   macro avg       0.15      0.14      0.12      5373
weighted avg       0.15      0.14      0.12      5373



### Logistic Regression

In [74]:
# Step 4: Model selection and training
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Model evaluation
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

print("Precision:", precision)
print("Classification Report:")
print(report)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision: 0.9154214893941176
Classification Report:
              precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       777
     article       0.94      0.96      0.95       790
        blog       0.91      0.85      0.88       811
       movie       0.96      0.84      0.90       805
      reddit       0.70      0.93      0.80       759
        song       0.95      0.87      0.91       769
     twitter       0.94      0.88      0.91       782

    accuracy                           0.90      5493
   macro avg       0.91      0.91      0.91      5493
weighted avg       0.92      0.90      0.91      5493



### Neural network

In [17]:

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.utils import to_categorical

# Calculate the input dimension based on the number of features
input_dim = X_train.shape[1]
# Define the number of classes
num_classes = 7

# Convert the target variable to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes)
y_test_encoded = to_categorical(y_test, num_classes)
# Create a sequential model
model = Sequential()
# Add a dense layer with ReLU activation
model.add(Dense(128, activation='relu', input_shape=(input_dim,)))
# Add dropout for regularization
model.add(Dropout(0.5))
# Add another dense layer with ReLU activation
model.add(Dense(64, activation='relu'))
# Add dropout for regularization
model.add(Dropout(0.5))
# Add the final dense layer with softmax activation for multi-class classification
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_encoded, batch_size=32, epochs=10, validation_data=(X_test, y_test_encoded))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)



ValueError: invalid literal for int() with base 10: 'movie'

## Ensambling method

In [49]:
# Create individual models
model1 = MultinomialNB()
model2 = RandomForestClassifier()
model3 = GradientBoostingClassifier()
model4 = SVC(probability=True)  # SVM model

# Create a simple neural network model
model5 = Sequential()
model5.add(Dense(128, activation='relu', input_shape=(input_dim,)))
model5.add(Dropout(0.5))
model5.add(Dense(64, activation='relu'))
model5.add(Dropout(0.5))
model5.add(Dense(num_classes, activation='softmax'))
model5.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Create a voting classifier
voting_classifier = VotingClassifier(estimators=[
    ('nb', model1),
    ('rf', model2),
    ('gb', model3),
    ('svm', model4),
    ('nn', model5)
], voting='soft')

# Train the voting classifier
voting_classifier.fit(X_train, y_train)
# Make predictions on the test data
y_pred = voting_classifier.predict(X_test)

# Evaluate the voting classifier
report = classification_report(y_test, y_pred)
print(report)


NameError: name 'input_dim' is not defined

In [32]:
# Preprocess and extract features for new text
new_text = "This is a new text to classify."
processed_new_text = preprocess_text(new_text)
new_features = vectorizer.transform([processed_new_text])
# Convert new_features to a dense matrix and then to a numpy array
new_features_array = np.asarray(new_features.toarray())
# Predict the source category for the new text
predicted_category = best_classifier.predict(new_features_array)
# Example usage
print(predicted_category)


['reddit']


### Test BERT


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [33]:
# Preprocessing steps
# Lowercase conversion
data_test = data
data_test['text'] = data_test['text'].str.lower()

# Split the data into training and testing sets
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(data['text'], data['Source'], test_size=0.2, random_state=42)
y_train_new

1231     abstract
531      abstract
20161        song
6332      article
11285        blog
           ...   
21575        song
5390      article
860      abstract
15795      reddit
23654     twitter
Name: Source, Length: 21968, dtype: object

In [10]:
# Step 2: Feature engineering
# Encode the target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_new)
y_test_encoded = label_encoder.transform(y_test_new)
num_classes = len(label_encoder.classes_)
num_classes

7

In [11]:
# Tokenization and encoding using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [48]:
# Parameters
batch_size = 16
num_workers = 2
max_length = 128

# Define collate function
def collate_fn(batch):
    batch = [sample for sample in batch if sample is not None]
    return torch.utils.data.dataloader.default_collate(batch)

from text_classification_dataset import TextClassificationDataset


In [49]:
texts = data_test['text'].tolist()
labels = data_test['Source'].tolist()

In [50]:
# Create tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [54]:
import random

# Set the desired dataset size
desired_dataset_size = 10000

# Shuffle the texts and labels
combined = list(zip(texts, labels))
random.shuffle(combined)
texts, labels = zip(*combined)

# Select a subset of texts and labels
texts = texts[:desired_dataset_size]
labels = labels[:desired_dataset_size]

# Create the dataset
train_dataset = TextClassificationDataset(texts, labels, tokenizer, max_length=max_length)


In [55]:
# Create the data loader
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    collate_fn=collate_fn
)

In [None]:
# Training loop
total_loss = 0
total_batches = 0

for batch in train_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    optimizer.zero_grad()

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item()
    total_batches += 1

    loss.backward()
    optimizer.step()

average_loss = total_loss / total_batches


In [31]:
unique_labels = data['Source'].unique()
print("Unique Labels:", unique_labels)

Unique Labels: ['abstract' 'article' 'blog' 'movie' 'reddit' 'song' 'twitter']


### Test 0.2


In [76]:

# Handle missing values in text data
data['text'].fillna("", inplace=True)

# Step 2: Feature engineering
text_data = data['text'].tolist()
target = data['Source']

# Stemming
stemmer = PorterStemmer()
stemmed_text = [' '.join([stemmer.stem(word) for word in text.split()]) for text in text_data]

# TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(stemmed_text)

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

# Step 4: Model selection and training
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Model evaluation
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred)

print("Precision:", precision)
print("Classification Report:")
print(report)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision: 0.9150437007119847
Classification Report:
              precision    recall  f1-score   support

    abstract       1.00      1.00      1.00       777
     article       0.95      0.96      0.95       790
        blog       0.91      0.85      0.88       811
       movie       0.95      0.84      0.89       805
      reddit       0.71      0.93      0.80       759
        song       0.94      0.88      0.91       769
     twitter       0.94      0.88      0.91       782

    accuracy                           0.90      5493
   macro avg       0.91      0.91      0.91      5493
weighted avg       0.92      0.90      0.91      5493

