In [2]:
import pandas as pd
import numpy as np

# Load the CSV file
df = pd.read_csv('withAuthor_comments_apache.csv')

# Filter rows where Prediction column is 1
df_positive = df[df['prediction'] == 1]

# Check if there are enough rows to create three sets of 4000 rows each
total_positive_rows = len(df_positive)
required_rows = 3 * 4000

if total_positive_rows < required_rows:
    raise ValueError(f"Not enough rows with Prediction = 1. Available: {total_positive_rows}, Required: {required_rows}")

# Shuffle the filtered dataframe
df_positive_shuffled = df_positive.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the shuffled dataframe into three sets
set1 = df_positive_shuffled.iloc[:4000]
set2 = df_positive_shuffled.iloc[4000:8000]
set3 = df_positive_shuffled.iloc[8000:12000]

# Optionally, save these sets to separate CSV files
set1.to_csv('set1_apache.csv', index=False)
set2.to_csv('set2_apache.csv', index=False)
set3.to_csv('set3_apache.csv', index=False)

print("Sets created and saved successfully.")


Sets created and saved successfully.


Number of rows in Positive.csv: 32623
Number of rows in set1_apache.csv: 4000
Number of rows in set2_apache.csv: 4000
Number of rows in set3_apache.csv: 4000
Number of remaining rows: 32623
Remaining rows created and saved successfully.


In [7]:
import pandas as pd

# Load the original CSV file
df = pd.read_csv('withAuthor_comments_apache.csv')

# Filter rows where Prediction column is 1
df_positive = df[df['prediction'] == 1]

# Load the existing sets from their CSV files
set1 = pd.read_csv('set1_apache.csv')
set2 = pd.read_csv('set2_apache.csv')
set3 = pd.read_csv('set3_apache.csv')

# Combine the three sets into one dataframe
combined_sets = pd.concat([set1, set2, set3])

# Find the remaining rows in df_positive that are not in the combined sets
remaining_rows = df_positive.merge(combined_sets, how='outer', indicator=True).loc[lambda x: x['_merge'] == 'left_only']

# Drop the merge indicator column
remaining_rows = remaining_rows.drop(columns=['_merge'])

# Optionally, save the remaining rows to a new CSV file
remaining_rows.to_csv('remaining_rows_apache.csv', index=False)

# Print the row counts for each set
print(f"Number of rows in Positive.csv: {len(df_positive)}")
print(f"Number of rows in set1_apache.csv: {len(set1)}")
print(f"Number of rows in set2_apache.csv: {len(set2)}")
print(f"Number of rows in set3_apache.csv: {len(set3)}")
print(f"Number of remaining rows: {len(remaining_rows)}")

print("Remaining rows created and saved successfully.")


Number of rows in Positive.csv: 32623
Number of rows in set1_apache.csv: 4000
Number of rows in set2_apache.csv: 4000
Number of rows in set3_apache.csv: 4000
Number of remaining rows: 18719
Remaining rows created and saved successfully.


In [19]:
import csv

# Open the file in read mode and get the last values
with open('set3_apache_answer.txt', 'r') as file:
    last_values = [int(line.strip().split(':')[-1]) for line in file]

print(len(last_values))    
    
# Read the existing CSV file and store its content
with open('set3_apache.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    rows = list(reader)

# Add the new header for the column
if rows:
    rows[0].append("actual")  # Add new header to the first row

# Append the last values as a new column to each row
for i, row in enumerate(rows[1:]):  # Skip the header row
    # Ensure we have enough last_values to add to each row
    if i < len(last_values):
        row.append(last_values[i])
    else:
        row.append('')  # Handle case where there are fewer last_values than rows

# Write the updated content to a new CSV file
with open('updated_set3_apache.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(rows)

print("The list has been added as a new column in the new CSV file.")


3999
The list has been added as a new column in the new CSV file.


In [20]:
import pandas as pd

# List of CSV files to merge
csv_files = ['updated_set3_apache.csv', 'updated_set1_apache.csv', 'updated_set2_apache.csv', 'updated_remaining_rows_apache.csv']

# Read and concatenate CSV files
dataframes = [pd.read_csv(file) for file in csv_files]
merged_df = pd.concat(dataframes, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_all_apache.csv', index=False)


In [22]:
import pandas as pd
import numpy as np

# Load the CSV file
df = pd.read_csv('withAuthor_comments_apache.csv')

# Filter rows where Prediction column is 1
df_positive = df[df['prediction'] == 0]
df_positive.to_csv('rowsWith0_apache.csv', index=False)