In [1]:
#JSON to Processed CSV 

import os
import json
import pandas as pd
import re
import string
import numpy as np
import json
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK corpora this notebook relies on (stop-word list and WordNet).
for _nltk_package in ('stopwords', 'wordnet'):
    nltk.download(_nltk_package)

# Flatten function for JSON processing
def flatten(obj, flat, prefix=''):
    """Copy the leaves of a nested dict into ``flat`` under dotted key paths.

    Args:
        obj: Dict to walk; values that are themselves dicts are descended into.
        flat: Dict that receives the results; it is mutated in place.
        prefix: Key path accumulated so far (ends with '.' below the top level).

    Returns:
        None. Non-dict values (lists included) are stored unchanged.
    """
    for name, value in obj.items():
        path = prefix + name
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Nested dict: recurse with the current path as the new prefix.
        flatten(value, flat, path + '.')

# Process JSON files and convert them to CSV
def process_json_to_csv(dirs):
    """Flatten every JSON file in each directory into one raw CSV per directory.

    For each directory, every regular file in it is parsed as JSON, flattened
    with ``flatten`` into a single dotted-key record, and the records are
    written to ``<directory name>_raw.csv`` in the current working directory.

    Args:
        dirs: Iterable of directory paths containing JSON files.

    Returns:
        List of the CSV file names written, in the same order as ``dirs``.

    Raises:
        OSError: If a directory cannot be listed or a file cannot be read.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    csv_files = []
    for dirname in dirs:
        records = []
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order of the CSV reproducible between runs and machines.
        for filename in sorted(os.listdir(dirname)):
            filepath = os.path.join(dirname, filename)
            if not os.path.isfile(filepath):
                # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as JSON.
                continue
            with open(filepath, 'r', encoding='utf8') as f:
                flat = {}
                flatten(json.load(f), flat)
                records.append(flat)
        # normpath() drops a trailing separator, which would otherwise make
        # basename() return '' and name the output just "_raw.csv".
        specific_dir_name = os.path.basename(os.path.normpath(dirname))
        csv_filename = f"{specific_dir_name}_raw.csv"
        pd.DataFrame(records).to_csv(csv_filename, index=False)
        csv_files.append(csv_filename)
    return csv_files

# Data cleaning and formatting functions
def cleanhtml(raw_html):
    """Return ``raw_html`` with tags and character entities deleted.

    Anything shaped like ``<...>`` (shortest match) is removed, as are
    ``&name;``, ``&#123;`` and ``&#x1f;`` style entities. Entity matching is
    case-sensitive: only lowercase names and lowercase hex digits match.
    """
    tag_or_entity = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', raw_html)

def clean_text_round1(text):
    """First cleaning pass: drop bracketed spans, URLs and digit words, then
    lowercase and blank out ASCII punctuation.

    Every removed span is replaced by a single space, so neighbouring words
    never get glued together; callers should expect runs of spaces.

    Args:
        text: The string to clean.

    Returns:
        The lowercased text with the noise replaced by spaces.
    """
    # Parenthesised and square-bracketed spans (shortest match, single line).
    text = re.sub(r'\(.*?\)', ' ', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    # URLs must go before the digit rule: removing "a1b" from
    # "http://a1b.com" first would leave a fragment no URL pattern matches.
    text = re.sub(r'https?://\S+', ' ', text)
    # Any word that contains a digit.
    text = re.sub(r'\w*\d\w*', ' ', text)
    # Form-feed characters together with the word characters touching them.
    text = re.sub(r'\w*\f\w*', ' ', text)
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    return text

def clean_text_round2(text):
    """Second cleaning pass: turn curly quotes, the ellipsis character,
    newlines and tabs into spaces.

    Each matched character becomes exactly one space, so the length of the
    text is unchanged.
    """
    for pattern in ('[‘’“”…]', '\n', '\t'):
        text = re.sub(pattern, ' ', text)
    return text

def process_and_format_csv(file_path):
    """Reduce a raw CSV to five columns, clean the descriptions and save a copy.

    Rows are de-duplicated, rows without a description are dropped, and the
    description is passed through ``cleanhtml``, ``clean_text_round1`` and
    ``clean_text_round2`` before being cut to at most 1000 characters. The
    result is written next to the input as ``<name>_cleaned.csv``.

    Args:
        file_path: Path of the raw CSV to read.

    Returns:
        Path of the cleaned CSV that was written.

    Raises:
        KeyError: If one of the five expected columns is missing from the CSV.
    """
    df = pd.read_csv(file_path)

    # Select and rename columns; .copy() gives an independent frame so the
    # assignments below never write into a view of the original.
    df = df[['BR.BR_id', 'BR.BR_text.BRsummary', 'BR.BR_text.BRdescription',
             'commit.commit_id', 'commit.changed_files.file_0.file_new_name']].copy()
    df.columns = ['BR_id', 'BRsummary', 'BRdescription', 'commit_id', 'file_new_name']

    # Data cleaning steps
    df = df.drop_duplicates().dropna(subset=['BRdescription']).copy()
    df['BRdescription'] = (
        df['BRdescription']
        .astype(str)              # a description parsed as a number must not break the regex helpers
        .apply(cleanhtml)
        .apply(clean_text_round1)
        .apply(clean_text_round2)
        .str[:1000]               # truncate descriptions to 1000 characters
    )

    # splitext() touches only the final extension. str.replace('.csv', ...)
    # would rewrite every '.csv' in the path and, for a path without '.csv',
    # return the input path unchanged so the input file got overwritten.
    root, _ext = os.path.splitext(file_path)
    formatted_csv = root + '_cleaned.csv'
    df.to_csv(formatted_csv, index=False)
    return formatted_csv

# Define directories for JSON processing
# Directories that will be scanned for JSON files
json_dirs = [
    "D:/ICSME-2023/Dataset/Denchmark/JSonSet/simplejson/tensorflow+tensorflow",
]

# Step 1: write one raw CSV per directory and remember the file names
csv_file_paths = process_json_to_csv(json_dirs)

# Step 2: write a cleaned copy of every raw CSV
for csv_file in csv_file_paths:
    process_and_format_csv(csv_file)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sigma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sigma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# CSV to TSV 
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from dateutil import parser
import csv
import os
from dateutil import parser
from datetime import datetime
import pytz

# Ensure NLTK resources are available
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared by preprocess_text(): English stop-word lookup set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def format_datetime(datetime_str):
    """Parse a date/time string and render it as ``YYYY-MM-DD HH:MM:SS``.

    Timezone-aware inputs are converted to UTC before formatting; inputs
    without timezone information are formatted with their value unchanged.
    """
    parsed = parser.parse(datetime_str)
    normalized = parsed if parsed.tzinfo is None else parsed.astimezone(pytz.utc)
    return normalized.strftime('%Y-%m-%d %H:%M:%S')

def convert_to_timestamp(datetime_str):
    """Convert a date/time string to a UNIX timestamp in whole seconds.

    Timezone-aware inputs map to their absolute instant. Inputs without
    timezone information are interpreted as UTC, so the result does not
    depend on the local timezone of the machine running the notebook
    (``datetime.timestamp()`` would otherwise read a naive value as local time).
    NOTE(review): assumes naive timestamps in the data are UTC; confirm
    against the data source.

    Args:
        datetime_str: Any date/time string ``dateutil.parser.parse`` accepts.

    Returns:
        Integer seconds since the epoch (fractional part truncated).
    """
    dt = parser.parse(datetime_str)
    if dt.tzinfo is None:
        # Pin naive values to UTC instead of the machine's local zone.
        dt = dt.replace(tzinfo=pytz.utc)
    # For aware datetimes timestamp() is already zone-independent, so no
    # astimezone() conversion is needed before calling it.
    return int(dt.timestamp())

def preprocess_text(text):
    """Normalise free text into a space-separated string of lowercase lemmas.

    Steps: strip HTML tags and URLs, replace every run of non-alphanumeric
    characters with a space, tokenize, lowercase, drop English stop words,
    lemmatize, and drop tokens that start with a digit.

    Args:
        text: The text to clean. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty cell) is treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces ('' for empty input).
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on NaN/None; a missing description has no tokens.
        return ''

    no_html = re.sub(r'<.*?>', '', text)
    no_urls = re.sub(r'http\S+', '', no_html)
    no_special_chars = re.sub(r'[^A-Za-z0-9]+', ' ', no_urls)

    # Lowercase BEFORE lemmatizing: the WordNet lemmatizer leaves capitalised
    # tokens such as "Errors" unreduced, so lowercasing afterwards produced
    # "errors" instead of "error".
    words = [word.lower() for word in nltk.word_tokenize(no_special_chars)]

    filtered_words = [word for word in words if word not in stop_words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # No punctuation filter is needed here: the substitution above leaves only
    # ASCII letters, digits and spaces, so no token can be a punctuation mark.

    # Drop tokens that begin with a digit (re.match anchors at the token start).
    no_digits = [word for word in lemmatized_words if not re.match(r'\d+', word)]

    return " ".join(no_digits)

def _join_file_names(values):
    """Join the non-missing cells of one row of file-name columns with single spaces."""
    names = []
    for value in values:
        if pd.isna(value):
            continue
        name = str(value)
        # 'None' / 'nan' strings are treated as missing, as are empty strings.
        if name and name not in ('None', 'nan'):
            names.append(name)
    return ' '.join(names)


def process_csv(file):
    """Build the preprocessed bug-report table for one raw CSV and save it.

    Reads ``file``, gathers every ``commit.changed_files.file_<n>.file_new_name``
    column into one space-separated ``files`` column, formats the report and
    commit times, cleans the description with ``preprocess_text`` and writes
    the result to ``file[:-4] + '_preprocessed.csv'`` with the row index saved
    as an ``id`` column.

    Args:
        file: Path of the raw CSV. The output name is built by dropping the
            last four characters, i.e. a ``.csv``-style suffix is assumed.

    Returns:
        Path of the preprocessed CSV that was written.

    Raises:
        KeyError: If one of the expected ``BR.*`` / ``commit.*`` columns is missing.
    """
    df = pd.read_csv(file)

    file_cols = df.filter(regex=r'^commit\.changed_files\.file_\d+\.file_new_name$')
    # Missing cells are skipped rather than replaced by ' ', so the joined
    # value no longer carries one padding space per empty file_<n> column.
    files = pd.Series(
        [_join_file_names(row) for row in file_cols.to_numpy(dtype=object)],
        index=df.index,
        dtype=object,
    )

    # A missing description is cleaned as empty text instead of reaching the
    # regex calls as NaN and raising TypeError.
    description = df['BR.BR_text.BRdescription'].fillna('').astype(str)

    final_df = pd.DataFrame({
        'bug_id': df['BR.BR_id'],
        'summary': df['BR.BR_text.BRsummary'],
        'description': description.apply(preprocess_text),
        'report_time': df['BR.BRopenT'].apply(format_datetime),
        'report_timestamp': df['BR.BRopenT'].apply(convert_to_timestamp),
        'status': 'resolved fixed',
        'commit': df['commit.commit_id'],
        'commit_timestamp': df['commit.commitT'].apply(convert_to_timestamp),
        'files': files,
    })

    final_df = final_df[['bug_id', 'summary', 'description', 'report_time', 'report_timestamp',
                         'status', 'commit', 'commit_timestamp', 'files']]

    output_path = file[:-4] + '_preprocessed.csv'
    final_df.to_csv(output_path, index_label='id')
    return output_path

def convert_csv_to_tsv(input_file):
    """Rewrite a comma-separated file as a tab-separated file beside it.

    The output path is the input path with its final extension replaced by
    ``.tsv``. Rows are copied through the ``csv`` module, so quoting is
    re-applied as needed for the tab delimiter.

    Args:
        input_file: Path of the CSV file to read (UTF-8).

    Returns:
        Path of the TSV file that was written.
    """
    # splitext() strips only the last extension. Splitting on the first '.'
    # broke paths such as './x.csv' or 'v1.2/x.csv'.
    output_file = os.path.splitext(input_file)[0] + '.tsv'
    # newline='' on both handles lets the csv module manage line endings,
    # including newlines embedded in quoted fields.
    with open(input_file, 'r', newline='', encoding='utf8') as csvfile, \
            open(output_file, 'w', newline='', encoding='utf8') as tsvfile:
        reader = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerows(reader)
    return output_file

# Raw CSV files to preprocess and then convert to TSV
files = ['tensorflow+tensorflow_raw.csv']

for file in files:
    process_csv(file)
    # process_csv() writes "<name>_preprocessed.csv"; rebuild that path for the TSV step
    preprocessed_csv = file[:-4] + '_preprocessed.csv'
    convert_csv_to_tsv(preprocessed_csv)

print("Processing complete.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sigma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sigma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sigma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing complete.


In [26]:
import csv

def convert_tsv_to_txt(input_file, output_file):
    """Copy a TSV file to a plain-text file with one tab-joined row per line.

    Rows are parsed with ``csv.reader`` (tab delimiter), so any CSV-style
    quoting in the input is removed; the fields are then written back joined
    by tabs and terminated with a newline.

    Args:
        input_file: Path of the TSV file to read (UTF-8).
        output_file: Path of the text file to create or overwrite (UTF-8).
    """
    with open(input_file, 'r', encoding='utf-8') as source, \
            open(output_file, 'w', encoding='utf-8') as target:
        parsed_rows = csv.reader(source, delimiter='\t')
        target.writelines('\t'.join(fields) + '\n' for fields in parsed_rows)

# Example usage
# TSV files to convert; replace with your own filenames
tsv_files = ['pytorchlightning+pytorch-lightning_raw_preprocessed.tsv']

for tsv_file in tsv_files:
    # Output keeps the same name with '.tsv' swapped for '.txt'
    output_txt = tsv_file.replace('.tsv', '.txt')
    convert_tsv_to_txt(tsv_file, output_txt)

print("Conversion complete.")


Conversion complete.
