# Dependencies

In [10]:
import pandas as pd
import cleantext
import csv

### cleaning med clean

In [11]:
df = pd.read_csv('data_test.csv')
print(df['content'].head(5))

df['content'] = df['content'].apply(lambda x: cleantext.clean(text=x))
print(df['content'].head(5))

0    Sometimes the power of Christmas will make you...
1    AWAKENING OF 12 STRANDS of DNA – “Reconnecting...
2    Never Hike Alone: A Friday the 13th Fan Film U...
3    When a rare shark was caught, scientists were ...
4    Donald Trump has the unnerving ability to abil...
Name: content, dtype: object
0    sometim power christma make wild wonder thing ...
1    awaken strand dna – “reconnect you” movi reade...
2    never hike alon friday th fan film usa min fan...
3    rare shark caught scientist left blunder answe...
4    donald trump unnerv abil abil creat realiti co...
Name: content, dtype: object


# Data cleaning and structuring

### cleaning med clean_words

In [12]:
df['content'] = df['content'].apply(lambda x: cleantext.clean_words(
    text=x,
    clean_all=True,
    extra_spaces=True,
    stemming=True,
    stopwords=True,
    stp_lang='english',
))

print(df['content'].head(5))

0    [sometim, power, christma, make, wild, wonder,...
1    [awaken, strand, dna, –, “reconnect, you”, mov...
2    [never, hike, alon, friday, th, fan, film, usa...
3    [rare, shark, caught, scientist, left, blunder...
4    [donald, trump, unnerv, abil, abil, creat, rea...
Name: content, dtype: object


### Export

In [13]:
df.to_csv('data_cleaned.csv')

# Categorization and splitting

In [14]:
temp = []  # Temporary list to store filtered rows
categories = set()  # Set to track unique categories
convert = {  # Dictionary to map categories to standardized labels
    '': "", 'conspiracy': 'fake', 'satire': 'fake', 'reliable': 'reliable',
    'unreliable': 'skip', 'junksci': 'fake', 'unknown': 'skip',
    'political': 'skip', 'fake': 'fake', 'hate': 'fake',
    'clickbait': 'reliable', 'bias': 'skip', 'rumor': 'fake'
}

fakeCount = 0  # Counter for fake news articles
realCount = 0  # Counter for reliable news articles

with open("data_cleaned.csv", "r") as src:  # Open CSV file for reading
    reader = csv.reader(src)  # Create a CSV reader object
    header = next(reader)  # Read and discard the header row
    
    for row in reader:  
        content = row[4]  # Extract category from the fourth column
        categories.add(content)  # Store the unique category
        
        row[4] = convert[row[4]]  # Convert category using the dictionary
        
        if row[4] == "fake":  
            fakeCount += 1  # Increment fake count
        elif row[4] == "reliable":
            realCount += 1  # Increment real count
        
        if row[4] != "skip":  # Skip unwanted categories
            temp.append(row)  # Append valid rows to the list

# Create a DataFrame from the filtered list, keeping the original column names
df = pd.DataFrame(temp, columns=header)

# Save the processed data to a new CSV file without renaming columns
df.to_csv('data_cleaned_fr.csv', index=False)

# Print the ratio of fake vs. real news articles and number of articles
print(f"Ratio of fakes: {fakeCount/(fakeCount+realCount)}")
print(f"Ratio of real: {realCount/(fakeCount+realCount)}")
print(f"Number of articles left: {len(temp)}")

Ratio of fakes: 0.9796954314720813
Ratio of real: 0.02030456852791878
Number of articles left: 209


In [15]:
# Load the dataset
train_data = pd.read_csv("output_file.csv")

# Prepare the data for the bag of words model
X = train_data['content']  # Text data
y = train_data['type']  # Labels (fake or reliable)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CountVectorizer to convert text into a bag of words
vectorizer = CountVectorizer(stop_words='english', max_features=5000)  # You can adjust max_features
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

NameError: name 'train_test_split' is not defined

# Training and evaluation