<a href="https://colab.research.google.com/github/ShubhamLolge/Applications-of-ML/blob/main/AML_part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Mount Google Drive if the dataset is stored there
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
# Define the path to the dataset folder
# data_path = "/content/drive/MyDrive/datasets_coursework1/bbc"

# Read the text files from each category folder and preprocess the data
# data = []
# categories = ['tech', 'business', 'sport', 'politics', 'entertainment']
# print(data_path)
# for category in categories:
#     category_path = os.path.join(data_path, category)

#     files = os.listdir(category_path)
#     for file in files:
#         with open(os.path.join(category_path, file), 'r', encoding='utf-8') as f:
#             text = f.read()
#             preprocessed_text = preprocess_text(text)
#             data.append({'text': preprocessed_text, 'category': category})

data_path = "/content/drive/MyDrive/datasets_coursework1/bbc"

data = []
categories = ['tech', 'business', 'sport', 'politics', 'entertainment']
for category in categories:
    category_path = os.path.join(data_path, category)
    files = os.listdir(category_path)
    for file in files:
        with open(os.path.join(category_path, file), 'r', encoding='latin-1') as f:
            text = f.read()
            preprocessed_text = preprocess_text(text)
            data.append({'text': preprocessed_text, 'category': category})

# Create a DataFrame from the preprocessed data
df = pd.DataFrame(data)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Data loaded, preprocessed, and split into training and testing sets successfully.")

Data loaded, preprocessed, and split into training and testing sets successfully.


In [26]:
# Text Preprocessing

# Tokenization using NLTK
from nltk.tokenize import word_tokenize

# Remove stop words using NLTK
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Apply stemming using NLTK
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# Preprocess text function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Apply stemming
    tokens = [stemmer.stem(token) for token in tokens]
    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Apply preprocessing to the entire DataFrame
df['preprocessed_text'] = df['text'].apply(preprocess_text)


In [27]:
# Feature Extraction

# Word Frequency Feature using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data
X_count = count_vectorizer.fit_transform(df['preprocessed_text'])

# TF-IDF Feature using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text data
X_tfidf = tfidf_vectorizer.fit_transform(df['preprocessed_text'])


In [28]:
# Feature Selection

# Chi-Square Test for Feature Selection
from sklearn.feature_selection import chi2

# Compute chi-squared statistics for feature selection
chi2_stat, p_values = chi2(X_count, df['category'])

# Print the chi-squared statistics
print("Chi-squared statistics for feature selection:")
print(chi2_stat)

# Mutual Information for Feature Selection
from sklearn.feature_selection import mutual_info_classif

# Compute mutual information for feature selection
mutual_info = mutual_info_classif(X_count, df['category'])

# Print the mutual information scores
print("Mutual information scores for feature selection:")
print(mutual_info)

Chi-squared statistics for feature selection:
[  2.34543876 178.9958664    4.33573141 ...   4.7642487   13.41682975
   9.52849741]
Mutual information scores for feature selection:
[0.0008007  0.04241952 0.00075299 ... 0.00078775 0.00132373 0.00157647]


In [30]:
# Model Selection and Training

# # Choose a classifier (e.g., Naive Bayes)
# from sklearn.naive_bayes import MultinomialNB

# # Initialize the classifier
# clf = MultinomialNB()

# # Train the classifier on the training set
# clf.fit(X_tfidf, train_df['category'])

# Split the dataset into training and testing sets before any preprocessing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Preprocess text and extract features from the training set
train_df['preprocessed_text'] = train_df['text'].apply(preprocess_text)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['preprocessed_text'])

# Train the classifier on the training set
clf.fit(X_train_tfidf, train_df['category'])


In [32]:
# Model Evaluation

# # Evaluate the classifier on the test set
# accuracy = clf.score(count_vectorizer.transform(test_df['preprocessed_text']), test_df['category'])
# print("Accuracy on the test set:", accuracy)

# Preprocess text and extract features from the test set using the same vectorizers as the training set
test_df['preprocessed_text'] = test_df['text'].apply(preprocess_text)
X_test_tfidf = tfidf_vectorizer.transform(test_df['preprocessed_text'])

# Evaluate the classifier on the test set
accuracy = clf.score(X_test_tfidf, test_df['category'])
print("Accuracy on the test set:", accuracy)

Accuracy on the test set: 0.950561797752809
