# SOCIAL MEDIA MINING - Project
## GROUP 2 : Atharva Pargaonkar, Sulbha Malviya, Sharadha K
### This file contains the data preprocessing steps and creates the different CSV files that are used in feature engineering.

In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
import json
import os

In [2]:
# Load the News-User data (tab-separated file without a header row)

news_user_path = '../Data/PolitiFact/PolitiFactNewsUser.txt'
news_user = pd.read_csv(
    news_user_path,
    sep='\t',
    header=None,
    names=['NewsID', 'UserID', 'Shares'],
)
print(news_user.head(5))

   NewsID  UserID  Shares
0     240       1       1
1     124       2       1
2     162       3       1
3     233       4       1
4      50       5       1


In [3]:
# Load the User-User data (tab-separated file without a header row)

user_user_path = '../Data/PolitiFact/PolitiFactUserUser.txt'
user_user = pd.read_csv(
    user_user_path,
    sep='\t',
    header=None,
    names=['follower', 'followee'],
)
print(user_user.head(5))


   follower  followee
0       507         1
1      1589         1
2      5307         1
3     11421         1
4     13267         1


In [4]:
# Write both dataframes to csv files in the working directory, without the index column
news_user.to_csv(path_or_buf='News_User.csv', index=False)
user_user.to_csv(path_or_buf='User_User.csv', index=False)

In [5]:
# Load the Ground Truth Data from News.txt
# Each line of the file is read as a single label string for one news article
ground_truth_path = '../Data/PolitiFact/News.txt'
ground_truth = pd.read_csv(
    ground_truth_path,
    sep='\t',
    header=None,
    names=['Label'],
)
print(ground_truth.head(5))


               Label
0  PolitiFact_Real_1
1  PolitiFact_Real_2
2  PolitiFact_Real_3
3  PolitiFact_Real_4
4  PolitiFact_Real_5


In [6]:
# Add a 1-based NewsID to the ground truth data (row position + 1)
ground_truth['NewsID'] = ground_truth.index + 1

# Save the Label/NewsID table to a csv file. add_news_id_to_content() later reads
# '../Data/News.csv' and joins on a 'filename' column, so the label is written
# under that name as well.
# NOTE(review): the layout is inferred from that later usage - confirm it.
# An existing News.csv is left untouched so a hand-prepared file is never overwritten.
if not os.path.exists('../Data/News.csv'):
    ground_truth.assign(filename=ground_truth['Label']).to_csv('../Data/News.csv', index=False)
print(ground_truth.head())

               Label  NewsID
0  PolitiFact_Real_1       1
1  PolitiFact_Real_2       2
2  PolitiFact_Real_3       3
3  PolitiFact_Real_4       4
4  PolitiFact_Real_5       5


In [7]:
# Inspect the last five rows of the ground truth data
print(ground_truth.tail(5))

                   Label  NewsID
235  PolitiFact_Fake_116     236
236  PolitiFact_Fake_117     237
237  PolitiFact_Fake_118     238
238  PolitiFact_Fake_119     239
239  PolitiFact_Fake_120     240


In [8]:
# Convert JSON files in a specified directory to a CSV file
import json
import os   
import re

def convert_json_to_csv(input_folder, csv_output_path):
    """Collect every ``*.json`` file in a folder into a single CSV file.

    Each JSON file is expected to contain one object (a dict) and becomes one
    row. A ``filename`` column is added that holds the file name truncated
    right after its first run of digits (for example
    ``PolitiFact_Fake_1-Webpage.json`` becomes ``PolitiFact_Fake_1``); it is
    written as the first column.

    The output is deterministic: files are processed in sorted name order and
    the remaining columns appear in the order their keys are first seen.
    Keys missing from a file are left empty in that file's row.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        csv_output_path: Path of the CSV file to write.
    """
    collected_data = []
    # A dict is used as an insertion-ordered set. A plain set would hand
    # pandas the columns in an arbitrary order that can change between runs.
    column_order = {}

    # os.listdir() returns entries in arbitrary order; sort for stable row order.
    for file in sorted(os.listdir(input_folder)):
        if not file.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, file)
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as json_file:
            content = json.load(json_file)
        # Keep everything up to and including the first digit run, drop the rest.
        content['filename'] = re.sub(r'(\d+).*', r'\1', file)
        column_order.update(dict.fromkeys(content))
        collected_data.append(content)

    # 'filename' goes first, then every other key in first-seen order.
    columns = [name for name in column_order if name != 'filename']
    if 'filename' in column_order:
        columns.insert(0, 'filename')

    csv_data = pd.DataFrame(collected_data).reindex(columns=columns)
    csv_data.to_csv(csv_output_path, index=False)
    print(f'CSV file saved to {csv_output_path}')


In [9]:
# Build one CSV from the fake news JSON directory, then one from the real news JSON directory
convert_json_to_csv(
    input_folder='../Data/PolitiFact/FakeNewsContent/',
    csv_output_path='../Data/FakeNewsContent.csv',
)
convert_json_to_csv(
    input_folder='../Data/PolitiFact/RealNewsContent/',
    csv_output_path='../Data/RealNewsContent.csv',
)

CSV file saved to ../Data/FakeNewsContent.csv
CSV file saved to ../Data/RealNewsContent.csv


In [10]:
# Merge the CSV files into a single DataFrame
def merge_csv_files(fake_csv, real_csv, output_merged_csv):
    """Stack the fake-news CSV on top of the real-news CSV and save the result.

    Rows read from ``fake_csv`` come first, followed by the rows read from
    ``real_csv``. The row index is renumbered and is not written to the
    output file.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (fake_csv, real_csv)]
    stacked = pd.concat(frames, ignore_index=True)
    stacked.to_csv(output_merged_csv, index=False)
    print(f"Combined CSV saved to {output_merged_csv}")

merge_csv_files(
    fake_csv='../Data/FakeNewsContent.csv',
    real_csv='../Data/RealNewsContent.csv',
    output_merged_csv='../Data/Merged_JSON_News.csv',
)


Combined CSV saved to ../Data/Merged_JSON_News.csv


In [11]:
# Add NewsID to the merged content
def add_news_id_to_content(merged_csv_path, news_csv_path, final_csv_output):
    """Attach a NewsID to every content row and save the rows sorted by NewsID.

    The content CSV is left-joined on ``filename`` with the ``filename`` and
    ``NewsID`` columns of the news CSV, so content rows without a match are
    kept and get an empty NewsID. ``NewsID`` is written as the first column.

    If any row ends up without a NewsID, a warning with the count is printed.
    In that case integer IDs are kept in pandas' nullable ``Int64`` dtype so
    they are still written as ``1``, ``2``, ... instead of being upcast to
    float and written as ``1.0``, ``2.0``, ...

    Args:
        merged_csv_path: CSV with the news content; must have a ``filename`` column.
        news_csv_path: CSV providing the ``filename`` and ``NewsID`` columns.
        final_csv_output: Path of the CSV file to write.
    """
    content_df = pd.read_csv(merged_csv_path)
    news_info_df = pd.read_csv(news_csv_path)
    id_lookup = news_info_df[['filename', 'NewsID']]
    # Remember the dtype before merging: unmatched rows turn int IDs into float.
    ids_are_integers = pd.api.types.is_integer_dtype(id_lookup['NewsID'])

    merged_with_ids = pd.merge(content_df, id_lookup, on='filename', how='left')

    missing_ids = int(merged_with_ids['NewsID'].isna().sum())
    if missing_ids:
        print(f"Warning: {missing_ids} row(s) have no NewsID after merging with {news_csv_path}")
        if ids_are_integers:
            # Nullable integer dtype: keeps whole-number IDs alongside missing values.
            merged_with_ids['NewsID'] = merged_with_ids['NewsID'].astype('Int64')

    # Ensure 'NewsID' is the first column and sort by it (missing IDs sort last)
    reordered_columns = ['NewsID'] + [col for col in merged_with_ids.columns if col != 'NewsID']
    final_df = merged_with_ids[reordered_columns].sort_values(by='NewsID')

    final_df.to_csv(final_csv_output, index=False)
    print(f"Final content with NewsID saved to {final_csv_output}")

# Attach NewsID to the merged content and write the final CSV
add_news_id_to_content(
    merged_csv_path='../Data/Merged_JSON_News.csv',
    news_csv_path='../Data/News.csv',
    final_csv_output='../Data/JSON_News.csv',
)

Final content with NewsID saved to ../Data/JSON_News.csv


In [13]:
# Reload the final content CSV and preview its first five rows
JSON_News = pd.read_csv(filepath_or_buffer='../Data/JSON_News.csv')
JSON_News.head(5)

Unnamed: 0,NewsID,filename,title,authors,text,publish_date,images,source,keywords,canonical_link,summary,movies,meta_data,top_img,url
0,1,PolitiFact_Real_1,Trump Just Insulted Millions Who Lost Everythi...,"['Brett Bose', 'Grant Stern', 'Steve Bernstein...",16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,{'$date': 1474934400000},['http://occupydemocrats.com/wp-content/upload...,http://occupydemocrats.com,[],http://occupydemocrats.com/2016/09/27/trump-ju...,,[],{'generator': 'Powered by Visual Composer - dr...,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...
1,2,PolitiFact_Real_2,Trump Campaign Chair Says Racism Didn't Exist ...,"['Colin Taylor', 'Grant Stern', 'Brett Bose', ...",24.8k SHARES SHARE THIS STORY\n\nRepublican no...,{'$date': 1474502400000},['http://occupydemocrats.com/wp-content/upload...,http://occupydemocrats.com,[],http://occupydemocrats.com/2016/09/22/unhinged...,,[],{'generator': 'Powered by Visual Composer - dr...,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/22/unhinged...
2,3,PolitiFact_Real_3,'Locked the black bitch out': White students u...,,The University of North Dakota is investigatin...,,,,,,,,,,
3,4,PolitiFact_Real_4,Massive Protests Erupt In North Carolina After...,"['Colin Taylor', 'Grant Stern', 'Brett Bose', ...",12k SHARES SHARE THIS STORY\n\nMassive protest...,{'$date': 1474416000000},['http://occupydemocrats.com/wp-content/upload...,http://occupydemocrats.com,[],http://occupydemocrats.com/2016/09/21/massive-...,,[],{'generator': 'Powered by Visual Composer - dr...,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/21/massive-...
4,5,PolitiFact_Real_5,Howard Dean ROASTS Trump Over Debate Sniffles:...,['Ryan Denson'],"In the first half of the debate, Donald Trump ...",{'$date': 1474934400000},"['http://i.imgur.com/JeqZLhj.png', 'https://d5...",http://addictinginfo.org,[],http://addictinginfo.com/2016/09/27/howard-dea...,,[],{'publisher': 'Addicting Info | The Knowledge ...,http://addictinginfo.addictinginfoent.netdna-c...,http://addictinginfo.org/2016/09/27/howard-dea...
