In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from daal4py.sklearn.ensemble import RandomForestRegressor
from daal4py.sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load the train and test datasets using Pandas
train_data = pd.read_csv("openAPITrain.csv")
test_data = pd.read_csv("openAPITest.csv")

In [3]:
# Handle missing values (if any)
train_data.fillna('', inplace=True)
test_data.fillna('', inplace=True)

In [4]:
# Check for missing values in training data
test_data.isnull().sum()

IDLink         0
Title          0
Headline       0
Source         0
Topic          0
PublishDate    0
Facebook       0
GooglePlus     0
LinkedIn       0
dtype: int64

In [5]:
# Create LabelEncoder object
le = LabelEncoder()
# Encode the categorical features in training data
train_data['Topic'] = le.fit_transform(train_data['Topic'])
train_data['Source'] = le.fit_transform(train_data['Source'])
# Encode the categorical features in testing data
test_data['Topic'] = le.fit_transform(test_data['Topic'])
test_data['Source'] = le.fit_transform(test_data['Source'])

In [6]:
# Convert the timestamp to a suitable format
train_data['PublishDate'] = pd.to_datetime(train_data['PublishDate'])
test_data['PublishDate'] = pd.to_datetime(test_data['PublishDate'])

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Convert to lowercase and remove punctuation
    words = [token.lower() for token in tokens if token not in string.punctuation]

    # Remove stopwords
    words_filtered = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words_filtered]

    return ' '.join(lemmatized_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/u190070/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/u190070/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/u190070/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Preprocess text data (Title and Headline columns)
train_data['Title'] = train_data['Title'].apply(preprocess_text)
train_data['Headline'] = train_data['Headline'].apply(preprocess_text)
test_data['Title'] = test_data['Title'].apply(preprocess_text)
test_data['Headline'] = test_data['Headline'].apply(preprocess_text)

In [9]:
from textblob import TextBlob

title_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
headline_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

def create_features(df, fit_vectorizers = False):
    # Word count for Title and Headline
    df['Title_word_count'] = df['Title'].apply(lambda x: len(x.split()))
    df['Headline_word_count'] = df['Headline'].apply(lambda x: len(x.split()))

    # Sentiment polarity and subjectivity for Title and Headline
    df['Title_polarity'] = df['Title'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['Title_subjectivity'] = df['Title'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
    df['Headline_polarity'] = df['Headline'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['Headline_subjectivity'] = df['Headline'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

    global title_vectorizer, headline_vectorizer
    if fit_vectorizers:
        title_vectorizer.fit(df['Title'])
        headline_vectorizer.fit(df['Headline'])
    
    title_tfidf = title_vectorizer.transform(df['Title']).toarray()
    headline_tfidf = headline_vectorizer.transform(df['Headline']).toarray()

    # Combine the existing features with the new TF-IDF features
    title_tfidf_df = pd.DataFrame(title_tfidf, columns=[f'Title_tfidf_{i}' for i in range(title_tfidf.shape[1])], index=df.index)
    headline_tfidf_df = pd.DataFrame(headline_tfidf, columns=[f'Headline_tfidf_{i}' for i in range(headline_tfidf.shape[1])], index=df.index)

    df = pd.concat([df, title_tfidf_df, headline_tfidf_df], axis=1)

    return df

In [10]:
# Create new features from text data
train_data = create_features(train_data, fit_vectorizers=True)
test_data = create_features(test_data, fit_vectorizers=True)

In [11]:
# Normalize the numerical features
scaler = MinMaxScaler()
train_data[['Facebook', 'GooglePlus', 'LinkedIn']] = scaler.fit_transform(train_data[['Facebook', 'GooglePlus', 'LinkedIn']])
test_data[['Facebook', 'GooglePlus', 'LinkedIn']] = scaler.transform(test_data[['Facebook', 'GooglePlus', 'LinkedIn']])

In [12]:
from sklearn.model_selection import train_test_split

X = train_data.drop(['IDLink', 'PublishDate', 'Title', 'Headline', 'SentimentTitle', 'SentimentHeadline'], axis=1)
y_test = test_data.drop(['IDLink', 'PublishDate', 'Title', 'Headline'], axis=1)
y_title = train_data['SentimentTitle']
y_headline = train_data['SentimentHeadline']

X_train, X_test, y_title_train, y_title_test, y_headline_train, y_headline_test = train_test_split(X, y_title, y_headline, test_size=0.2, random_state=42)


In [13]:
X

Unnamed: 0,Source,Topic,Facebook,GooglePlus,LinkedIn,Title_word_count,Headline_word_count,Title_polarity,Title_subjectivity,Headline_polarity,...,Headline_tfidf_90,Headline_tfidf_91,Headline_tfidf_92,Headline_tfidf_93,Headline_tfidf_94,Headline_tfidf_95,Headline_tfidf_96,Headline_tfidf_97,Headline_tfidf_98,Headline_tfidf_99
0,3560,2,0.000000,0.000000,0.000000,6,14,0.000000,0.000000,-0.100000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,376,0,0.000000,0.000000,0.000000,4,15,0.000000,0.000000,0.100000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,376,0,0.000000,0.000000,0.000000,6,14,0.000000,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2610,0,0.000000,0.000000,0.000000,4,18,0.000000,0.000000,-0.166667,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3286,0,0.000000,0.000000,0.000000,7,16,0.000000,0.000000,0.133333,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40503,1469,2,0.016627,0.060726,0.005201,9,16,0.100000,0.100000,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40504,3398,3,0.000245,0.000789,0.000274,8,17,0.400000,0.800000,0.200000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40505,2564,3,0.000539,0.012618,0.000274,7,13,0.000000,0.000000,0.000000,...,0.431484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40506,546,2,0.001469,0.001577,0.000274,7,19,0.136364,0.454545,0.500000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Train Ridge Regression and Random Forest Regressor models
ridge_title = Ridge().fit(X_train, y_title_train)
ridge_headline = Ridge().fit(X_train, y_headline_train)

In [15]:
rf_title = RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42).fit(X_train, y_title_train)
rf_headline = RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42).fit(X_train, y_headline_train)

In [16]:
# Define base estimators for stacking
base_estimators = [
    ('ridge_title', Ridge(alpha = 10.0)),
    ('ridge_headline', Ridge(alpha = 10.0)),
    ('rf_title', RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42)),
    ('rf_headline', RandomForestRegressor(n_estimators=500, max_depth=5, random_state=42))
]

In [17]:
# Create Stacking Regressor for both SentimentTitle and SentimentHeadline
stacking_title = StackingRegressor(estimators=base_estimators, final_estimator=Ridge())
stacking_headline = StackingRegressor(estimators=base_estimators, final_estimator=Ridge())

In [None]:
# Train the stacking models
stacking_title.fit(X_train, y_title_train)
stacking_headline.fit(X_train, y_headline_train)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# Evaluate the models' performance using Stacking Regressor
stacking_title_pred = stacking_title.predict(X_test)
stacking_headline_pred = stacking_headline.predict(X_test)

mse_stacking_title = mean_squared_error(y_title_test, stacking_title_pred)
mse_stacking_headline = mean_squared_error(y_headline_test, stacking_headline_pred)

print(f'MSE Stacking Title: {mse_stacking_title}')
print(f'MSE Stacking Headline: {mse_stacking_headline}')


In [None]:
# Use the trained and optimized models to predict SentimentTitle and SentimentHeadline for the test dataset
test_data['SentimentTitle'] = stacking_title.predict(y_test)
test_data['SentimentHeadline'] = stacking_headline.predict(y_test)

In [None]:
test_data[['IDLink', 'SentimentTitle', 'SentimentHeadline']].to_csv("output.csv", index=False)

In [None]:
test_data[['IDLink', 'SentimentTitle', 'SentimentHeadline']]