In [35]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.svm import SVC
from sklearn.utils import shuffle

In [4]:
# Data import
data = pd.read_csv('Data/balenced_data.csv')

## Tokenization and stemming

In [33]:
# Preprocess function to clean and preprocess text

def preprocess_text(text):
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenization
    tokens = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    return processed_text
# Example usage
# text = "This is an example sentence."
# processed_text = preprocess_text(text)
# print(processed_text)


## Vectorization

In [12]:
# The following code will cause memory error: 
# Unable to allocate 45.7 GiB for an array with shape (26865, 228492) and data type float64

# Drop rows with missing values in the 'text' column
# data = data.dropna(subset=['text'])
# Create a TF-IDF vectorizer
# vectorizer = TfidfVectorizer()
# Fit and transform the preprocessed text data
# features = vectorizer.fit_transform(data['text'])
# Convert features to a dense matrix
# features = features.todense()
# Example usage
# print(features.shape)


Due to the above mentioned memory problem, we manually limited the maximum features extracted. Limiting the number of features can potentially impact the model's performance, as it reduces the amount of information available for training.
By setting a maximum number of features, we are essentially reducing the dimensionality of the feature space. This can help mitigate memory constraints and improve computational efficiency. However, it also means that some potentially relevant features may be discarded, which can result in a loss of information. It's important to strike a balance between reducing dimensionality for efficiency purposes and retaining enough informative features for effective model training. The optimal number of features may vary depending on the specific dataset and problem domain, so it's worth experimenting with different feature subset sizes to find the most suitable configuration.

In [21]:

# Shuffle the data to get a random subset
shuffled_data = shuffle(data, random_state=42)
# Set the maximum number of features and the sample size
max_features = 100000
sample_size = 26865
# Create a TF-IDF vectorizer with limited features
vectorizer = TfidfVectorizer(max_features=max_features)
# Sample a smaller subset of the data for feature extraction
sampled_data = shuffled_data.sample(n=sample_size, random_state=42)
# Fit and transform the preprocessed text data
features = vectorizer.fit_transform(sampled_data['text'])
# Example usage
print(features.shape)


(26865, 100000)


## Model training and evaluation

In [24]:
# Split the features and labels into training and testing subsets
X_train, X_test, y_train, y_test = train_test_split(features, data['Source'], test_size=0.2, random_state=42)
# Example usage
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(21492, 100000) (21492,)
(5373, 100000) (5373,)


### MultinominalNB als Model

In [25]:
# Create a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
# Train the classifier
classifier.fit(X_train, y_train)
# Example usage


In [26]:
# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    abstract       0.14      0.35      0.20       792
     article       0.15      0.08      0.10       801
        blog       0.16      0.19      0.17       775
       movie       0.06      0.00      0.00       671
      reddit       0.13      0.15      0.14       768
        song       0.17      0.11      0.13       786
     twitter       0.14      0.12      0.13       780

    accuracy                           0.14      5373
   macro avg       0.13      0.14      0.12      5373
weighted avg       0.14      0.14      0.13      5373



### Support Vector Machines (SVM)

In [36]:
# Create an SVM classifier
classifier = SVC()
# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)

### Random Forest

In [None]:
# Create a Random Forest classifier
classifier = RandomForestClassifier()

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


### Gradient Boosting

In [None]:

# Create a Gradient Boosting classifier
classifier = GradientBoostingClassifier()

# Train the classifier
classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


### Neural network

In [None]:
# Create a sequential model
model = Sequential()
# Add a dense layer with ReLU activation
model.add(Dense(128, activation='relu', input_shape=(input_dim,)))
# Add dropout for regularization
model.add(Dropout(0.5))
# Add another dense layer with ReLU activation
model.add(Dense(64, activation='relu'))
# Add dropout for regularization
model.add(Dropout(0.5))
# Add the final dense layer with softmax activation for multi-class classification
model.add(Dense(num_classes, activation='softmax'))
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


In [27]:
# Define the parameter grid for grid search
param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}

# Perform grid search to find the best parameters
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best classifier from grid search
best_classifier = grid_search.best_estimator_

# Example usage
print(best_classifier)


MultinomialNB(alpha=10.0)


In [32]:
# Preprocess and extract features for new text
new_text = "This is a new text to classify."
processed_new_text = preprocess_text(new_text)
new_features = vectorizer.transform([processed_new_text])
# Convert new_features to a dense matrix and then to a numpy array
new_features_array = np.asarray(new_features.toarray())
# Predict the source category for the new text
predicted_category = best_classifier.predict(new_features_array)
# Example usage
print(predicted_category)


['reddit']
