In [None]:
import pandas as pd
import numpy as np
import json
import ijson
import requests
from bs4 import BeautifulSoup
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt


In [None]:
# Load the dataset: the file is read as JSON Lines (one JSON object per line).

file_path = "./News_Category_Dataset_IS_course.json"
# file_path = "./test.json"

# A context manager closes the file handle deterministically; the previous
# bare open() inside the comprehension never closed it explicitly.
# The former object_hook was dropped: json only ever hands dicts to
# object_hook, so its str/None branch could never fire and every object was
# returned unchanged. Blank lines are skipped instead of raising a decode
# error, and UTF-8 is requested explicitly rather than the locale default.
with open(file_path, 'r', encoding='utf-8') as json_file:
    data = [json.loads(line) for line in json_file if line.strip()]

# Preview a few records instead of echoing the whole list into the output.
data[:5]

In [None]:
# Build a DataFrame from the parsed records.
df = pd.DataFrame(data)

# Number of articles per category, and how many distinct categories exist.
categories = df['category'].value_counts()
num_categories = categories.shape[0]

print(categories)

In [None]:
# Fills missing short descriptions with text scraped from the article page.

def add_story(df, start=15360, timeout=10):
    """Backfill non-string ``short_description`` cells by scraping ``link``.

    Rows are addressed by position. For every row from ``start`` onwards
    whose ``short_description`` is not a string, the article page is
    downloaded and the text of its ``div.primary-cli`` elements (all but
    the last two) is joined with spaces and stored in that same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``short_description`` and ``link`` columns. It is
        modified in place.
    start : int, default 15360
        Position of the first row to inspect. The default keeps the
        previously hard-coded starting row.
    timeout : float, default 10
        Seconds to wait for each HTTP request, so one stalled server cannot
        hang the whole loop.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    description_col = df.columns.get_loc('short_description')

    for row_pos in range(start, len(df)):
        short_description = df['short_description'].iloc[row_pos]
        if isinstance(short_description, str):
            continue

        print(f"index {row_pos}")
        link = df['link'].iloc[row_pos]
        try:
            response = requests.get(link, timeout=timeout)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            # All the text blocks in the article that could be useful
            # (the last few aren't, so the final two are dropped).
            article_blocks = soup.find_all('div', class_='primary-cli')
            current_string = " ".join(block.text for block in article_blocks[:-2])
        except Exception as exc:
            # Best effort: one broken link must not abort the run. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print(f"index {row_pos}: skipped ({exc!r})")
            continue

        # Write by position into the row that was read. The old inner loop
        # reused `i`, so the text landed on whichever row matched the last
        # inner index instead of the current one.
        df.iloc[row_pos, description_col] = current_string

    return df




# Backfill missing descriptions. add_story returns the DataFrame it was given
# (it edits df rather than copying it), so `stories` and `df` are the same object.

stories = add_story(df)

# Display the result as the cell output.
stories


In [None]:
# Save the backfilled frame to fixed_data.csv (without the index column); later cells reload this file.
stories.to_csv("fixed_data.csv", index=False)

In [None]:
# NLTK normalisers, instantiated once at module level so they can be reused across calls.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()



# Text normalisation used to build the model features.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Lower-case, tokenise, filter and lemmatise ``text``.

    Steps:
      1. ``text`` is coerced with ``str()`` and lower-cased, then tokenised
         with ``word_tokenize``.
      2. Only purely alphabetic tokens are kept; this already discards
         punctuation and numbers, so no separate punctuation stripping is
         needed.
      3. English stop words are removed.
      4. Each remaining token is lemmatised with the module-level
         ``lemmatizer``.

    Parameters
    ----------
    text : object
        Anything accepted by ``str()``. Note that missing values are
        stringified as well (e.g. ``None`` becomes the token ``"none"``).

    Returns
    -------
    str
        The lemmatised tokens joined by single spaces.
    """
    tokens = word_tokenize(str(text).lower())

    # The stop-word list is cached: rebuilding it per call was pure overhead
    # when this function is applied to every row of a DataFrame.
    stop_words = _english_stop_words()
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]

    # The earlier version also stemmed every token and discarded the result;
    # only the lemmatised form was ever returned, so the stemming is gone.
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)




In [None]:
# Reload the scraped data written earlier in the notebook.
df = pd.read_csv('./fixed_data.csv', sep=',')
# df

# The following cell concatenates 'cleaned_text' and 'cleaned_headline'.
# With these two steps commented out, a fresh Restart & Run All failed with a
# KeyError unless the CSV already held the columns from an earlier session.
# They are computed only when absent, so re-running this cell after the
# enriched CSV has been saved does not repeat the preprocessing.
if 'cleaned_text' not in df.columns:
    df['cleaned_text'] = df['short_description'].apply(preprocess_text)
if 'cleaned_headline' not in df.columns:
    df['cleaned_headline'] = df['headline'].apply(preprocess_text)

In [None]:
# Model input text: cleaned description followed by cleaned headline, separated by one space.
description_part = df['cleaned_text'].astype(str)
headline_part = df['cleaned_headline'].astype(str)
df['cleaned_information'] = description_part.str.cat(headline_part, sep=' ')

In [None]:
# Overwrite fixed_data.csv with the frame that now includes 'cleaned_information' (index column omitted).
df.to_csv("fixed_data.csv", index=False)

In [None]:
# Reload the prepared CSV, shuffle the rows with a fixed seed, and drop every
# row that has a missing value in any column.
data = (
    pd.read_csv("./fixed_data.csv", sep=',')
    .sample(frac=1, random_state=42)
    .dropna()
)
# data = data.fillna('')

# data['cleaned_text'] = data['cleaned_text'].fillna('')


data['cleaned_information']
# data['headline']



In [None]:

# Share of articles per category, drawn as a pie chart with percentage labels.
category_counts = data['category'].value_counts()

ax = category_counts.plot.pie(figsize=(6, 6), autopct='%1.1f%%')
ax.set_title('Category Distribution')
ax.set_ylabel('')  # Remove y-axis label for better clarity
plt.show()

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the raw text columns travel along
# with the cleaned text so they stay aligned with the split.
feature_columns = ['cleaned_information', 'short_description', 'headline']
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data['category'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training text only and
# then applied unchanged to the test text.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train['cleaned_information'])
X_test_vec = vectorizer.transform(X_test['cleaned_information'])

# Whitespace-tokenised copies of the same text (the training tokens feed the
# Word2Vec cell further down).
tokenized_train_text = list(map(str.split, X_train['cleaned_information']))
tokenized_test_text = list(map(str.split, X_test['cleaned_information']))


# tokenized_test_text

# Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_vec, y_train)
logistic_predictions = logistic_model.predict(X_test_vec)
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
print("Logistic Regression Accuracy:", logistic_accuracy)




# Random Forest model (it performs worse)
# rf_model = RandomForestClassifier()
# rf_model.fit(X_train_vec, y_train)
# rf_predictions = rf_model.predict(X_test_vec)
# rf_accuracy = accuracy_score(y_test, rf_predictions)
# print("Random Forest Accuracy:", rf_accuracy)


In [None]:
from gensim.models import Word2Vec

# Train 100-dimensional word embeddings on the tokenised training text.
# min_count=1 keeps every word, including those seen only once.
w2v_model = Word2Vec(
    tokenized_train_text,
    vector_size=100,
    window=5,
    min_count=1,
    workers=6,
    epochs=10,
)



In [None]:
# Vocabulary of the trained Word2Vec model.
all_words = w2v_model.wv.index_to_key


# Lower-cased names of the distinct categories present in the data.
categories = list(map(str.lower, data['category'].unique().tolist()))
print(categories)

# Lookup table: word -> embedding vector.
word_vectors_dict = dict((word, w2v_model.wv[word]) for word in all_words)
# category_vectors = [w2v_model.wv[word] for word in categories]



In [None]:
# Embedding vector for every vocabulary word, in vocabulary order.
keyed_vectors = w2v_model.wv
word_vectors = [keyed_vectors[token] for token in all_words]
print(word_vectors)


# logistic_model = LogisticRegression()
# logistic_model.fit(word_vectors, y_train)
# logistic_predictions = logistic_model.predict(X_test_vec)
# logistic_accuracy = accuracy_score(y_test, logistic_predictions)
# print("Logistic Regression Accuracy:", logistic_accuracy)

In [None]:
def text_to_vector(text, model):
    """Average the embeddings of the tokens of ``text`` known to ``model``.

    ``text`` is lower-cased and tokenised with ``word_tokenize``; tokens
    missing from ``model.wv`` are ignored.

    Returns the mean of the remaining vectors, or ``None`` when no token of
    ``text`` is in the vocabulary.
    """
    embeddings = model.wv
    known = []
    for token in word_tokenize(text.lower()):
        if token in embeddings:
            known.append(embeddings[token])

    if len(known) == 0:
        return None
    return sum(known) / len(known)

# Tokenise every document and train a second Word2Vec model on all of them.
# NOTE(review): the embeddings are fitted on the full data set, before the
# train/test split done in the following cell - confirm this is intended.
tokenized_texts = [
    word_tokenize(document.lower()) for document in data['cleaned_information']
]
model = Word2Vec(
    tokenized_texts,
    vector_size=100,
    window=20,
    min_count=1,
    workers=8,
    epochs=15,
)
# One averaged embedding per document (None where no token is in the vocabulary).
vectors = [text_to_vector(document, model) for document in data['cleaned_information']]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Category label of every document, in the same order as `vectors`.
cat = data['category']
# print(cat, len(cat), len(vectors))

# Keep only the documents that produced an embedding, together with their labels.
filter_vec = [vec for vec in vectors if vec is not None]
filter_cat = [label for label, vec in zip(cat, vectors) if vec is not None]

X_train, X_test, y_train, y_test = train_test_split(
    filter_vec,
    np.array(filter_cat),
    test_size=0.2,
    random_state=42,
)

# Train a classifier
classifier = LogisticRegression(max_iter=4000)
classifier.fit(X_train, y_train)

# 7-fold cross-validation on the training portion.
cv_scores = cross_val_score(classifier, X_train, y_train, cv=7)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean():.2f}')

# Make predictions
predictions = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


In [None]:
# Integer code for each category name; the code is the position in this list.
_category_order = [
    'POLITICS', 'TRAVEL', 'WELLNESS', 'BUSINESS', 'BLACK VOICES',
    'HEALTHY LIVING', 'ENTERTAINMENT', 'QUEER VOICES', 'COMEDY',
    'STYLE & BEAUTY', 'FOOD & DRINK', 'SPORTS', 'HOME & LIVING',
    'PARENTING', 'PARENTS',
]
label_mapping = {name: code for code, name in enumerate(_category_order)}

# Encoded labels, kept in a separate frame with the same row order as `data`.
train_data_1 = pd.DataFrame()
train_data_1['category'] = data['category'].map(label_mapping).copy()
train_data_1['category'].unique()


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Documents without an embedding (vector is None) are dropped from the
# features, so the encoded labels must be filtered with the same mask.
# Previously only the features were filtered, which left features and labels
# with different lengths as soon as a single document had no embedding.
has_vector = np.array([v is not None for v in vectors], dtype=bool)
filter_vec = [v for v in vectors if v is not None]
encoded_labels = train_data_1['category'].to_numpy()[has_vector]

X_train, X_test, y_train, y_test = train_test_split(
    filter_vec,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)
# print(X_train)

# NOTE(review): this rebinds `model`, which an earlier cell uses for the
# Word2Vec model; re-running that earlier cell's vector computation after
# this one would pick up the XGBoost classifier instead.
model = XGBClassifier()
model.fit(X_train,y_train)

predictions = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the XGBoost classifier (3 x 3 x 3 combinations).
paramter_grid = dict(
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[50, 100, 200],
    max_depth=[3, 4, 5],
)

# 3-fold cross-validated search scored by accuracy, using 8 parallel jobs.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=paramter_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=8,
)
grid_search.fit(X_train, y_train)
best_paramters = grid_search.best_params_

# Train a fresh classifier with the winning settings and score it on the held-out split.
final_model = XGBClassifier(**best_paramters)
final_model.fit(X_train, y_train)

make_predictions = final_model.predict(X_test)
accuracy = accuracy_score(y_test, make_predictions)

print("Best Hyperparameters:", best_paramters)
print("Test Set Accuracy:", accuracy)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Extract text and labels
# texts = [doc['cleaned_information'] for doc in data]
# labels = [doc['category'] for doc in data]
texts, labels = data['cleaned_information'], data['category']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Bag-of-words counts: vocabulary fitted on the training text only.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Candidate values for the MultinomialNB `alpha` parameter.
param_grid = {'alpha': [0.1, 0.5, 1.0, 1.5, 2.0]}

# 5-fold grid search over a multinomial Naive Bayes classifier.
classifier = MultinomialNB()
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_bow, y_train)

# Report the winning setting and its cross-validated accuracy.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}')


# classifier.fit(X_train_bow, y_train)

# Best estimator found by the search.
best_classifier = grid_search.best_estimator_

# Score it on the held-out test set.
predictions = best_classifier.predict(X_test_bow)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, predictions))

In [None]:
# Smoke test: classify one hand-written sentence with the current
# `vectorizer` and `best_classifier`.
# NOTE(review): the sentence is vectorised as-is, whereas the model was
# trained on 'cleaned_information' - presumably it should be cleaned the same
# way first; confirm.
# This cell rebinds `texts`, `X_test_bow` and `predictions`.
texts = ["Lets watch tennis"]
X_test_bow = vectorizer.transform(texts)

predictions = best_classifier.predict(X_test_bow)
print(predictions)



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Extract text and labels
# texts = [doc['cleaned_information'] for doc in data]
# labels = [doc['category'] for doc in data]
texts, labels = data['cleaned_information'], data['category']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Bag-of-words counts: vocabulary fitted on the training text only.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Forest sizes and depth limits to try (None leaves max_depth at no limit).
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
)

# 5-fold grid search over a Random Forest classifier with a fixed seed.
classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_bow, y_train)

# Report the winning setting and its cross-validated accuracy.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}')


# classifier.fit(X_train_bow, y_train)

# Best estimator found by the search.
best_classifier = grid_search.best_estimator_

# Score it on the held-out test set.
predictions = best_classifier.predict(X_test_bow)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, predictions))