# In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from deap import base, creator, tools, algorithms

# Download stopwords (only need to run this once).
# NOTE: on a network failure nltk.download() only prints an error instead of
# raising, so a missing corpus surfaces later, when the stopword list is read.
nltk.download('stopwords')

# Read the data from the CSV file
data = pd.read_csv("data.csv")

# Combine 'Headline' and 'Body' into a new column 'News'.
# Empty CSV cells are loaded as NaN, and `str + NaN` is NaN; such a value
# would later crash the regex-based cleaning, which expects a string.
# Treat a missing headline/body as empty text instead.
data['News'] = data['Headline'].fillna('') + " " + data['Body'].fillna('')

# Drop the features that are not needed
features_dropped = ['URLs', 'Headline', 'Body']
data = data.drop(features_dropped, axis=1)

# Text processing (cleaning)
ps = PorterStemmer()

# Load the English stopwords once. stopwords.words() returns a list, so
# calling it for every token rebuilt that list and scanned it linearly each
# time; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def wordopt(text):
    """Normalise one document for vectorization.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are removed and
    the remaining tokens are Porter-stemmed.

    Args:
        text: the raw document as a string.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    # Stopwords are filtered before stemming, so the comparison is made
    # against the unstemmed token.
    return ' '.join(
        ps.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS
    )

# Clean every document in place before it is split and vectorized.
data['News'] = data['News'].apply(wordopt)

# Split the data into training and testing sets (75% / 25%).
# A fixed random_state keeps the partition identical between runs, so
# fitness values computed on it are comparable from one run to the next.
X = data['News']
Y = data['Label']
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# Vectorization of data.
# The TF-IDF vocabulary and weights are fitted on the training documents
# only; the test documents are merely transformed with that fitted model.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

# Genetic Algorithm for feature selection
def evaluate_individual(individual):
    """Score a feature mask by the accuracy of a linear SVM restricted to it.

    Args:
        individual: sequence of 0/1 flags, one per column of the TF-IDF
            matrices; a truthy flag keeps the corresponding column.

    Returns:
        A one-element tuple ``(accuracy,)``, the shape assigned to
        ``ind.fitness.values`` for the single-objective fitness.
    """
    # Build the column index list once and reuse it for both matrices.
    selected_idx = [idx for idx, selected in enumerate(individual) if selected]

    # A mask that keeps no feature cannot be fitted (SVC rejects a matrix
    # with zero columns); give it the worst possible fitness instead.
    if not selected_idx:
        return 0.0,

    xv_train_individual = xv_train[:, selected_idx]
    xv_test_individual = xv_test[:, selected_idx]

    svm_model = SVC(kernel='linear')
    svm_model.fit(xv_train_individual, y_train)
    y_pred = svm_model.predict(xv_test_individual)
    # NOTE(review): fitness is measured on xv_test, so the selected features
    # are tuned to that split; a separate hold-out set is needed if an
    # unbiased accuracy is to be reported afterwards.
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy,

# DEAP setup
# creator.create() registers the new classes on the shared `creator` module.
# Guard the calls so that re-running this cell in the same session does not
# redefine them (DEAP emits a RuntimeWarning when a class is overwritten).
if not hasattr(creator, "FitnessMax"):
    # Single objective, maximised (positive weight).
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Each gene is a random 0/1 flag: np.random.randint(0, 2) excludes the upper bound.
toolbox.register("attr_bool", np.random.randint, 0, 2)
# One gene per TF-IDF column. The column count is read from the matrix shape
# so this does not depend on get_feature_names(), which newer scikit-learn
# releases no longer provide.
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=xv_train.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", evaluate_individual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

# Genetic Algorithm parameters
population_size = 50
generations = 20
crossover_probability = 0.5
mutation_probability = 0.2

population = toolbox.population(n=population_size)

# Keeps a copy of the best individual seen in any generation. Tournament
# selection is stochastic, so the final population alone is not guaranteed
# to still contain it.
hall_of_fame = tools.HallOfFame(1)

# Evaluate the entire population
fitnesses = list(map(toolbox.evaluate, population))
for ind, fit in zip(population, fitnesses):
    ind.fitness.values = fit
hall_of_fame.update(population)

for generation in range(generations):
    # varAnd clones the population, then applies crossover and mutation;
    # only the clones it actually modified have their fitness invalidated.
    offspring = algorithms.varAnd(population, toolbox, cxpb=crossover_probability, mutpb=mutation_probability)

    # Every evaluation trains an SVM, so re-score only the individuals whose
    # fitness was invalidated; untouched clones keep their valid fitness.
    invalid_offspring = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = list(map(toolbox.evaluate, invalid_offspring))
    for ind, fit in zip(invalid_offspring, fitnesses):
        ind.fitness.values = fit
    hall_of_fame.update(offspring)

    population = toolbox.select(offspring, k=population_size)

best_individual = hall_of_fame[0]

# get_feature_names() is absent from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back for older installations.
if hasattr(vectorization, "get_feature_names_out"):
    feature_names = vectorization.get_feature_names_out()
else:
    feature_names = vectorization.get_feature_names()
selected_features = [feature for feature, selected in zip(feature_names, best_individual) if selected]

print("Selected features:", selected_features)


# Captured cell output:
# [nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
# [nltk_data]     connection attempt failed because the connected party
# [nltk_data]     did not properly respond after a period of time, or
# [nltk_data]     established connection failed because connected host
# [nltk_data]     has failed to respond>


# FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'