In [23]:
# Write the exp7 label row counts to a CSV file.
# Label files live in exp7/labels and are named like new_building_XX.txt
# (the exp2 data set used names like exp2\labels\3-27-2023_14-54-34_XX.txt).
# The number of rows in each text file is saved to the CSV as one entry.
# If a file is missing (for example there is no new_building_4.txt), its row should be new_building_4.txt,0


import os
import csv

# Folder that holds the label .txt files.
path = 'exp7/labels'

# Keep only regular .txt files: os.listdir() also returns sub-directories and
# stray entries (e.g. .ipynb_checkpoints), and open() on a directory raises.
# Sorting makes the CSV row order deterministic, because os.listdir() returns
# entries in arbitrary, platform-dependent order.
files = sorted(
    name for name in os.listdir(path)
    if name.endswith('.txt') and os.path.isfile(os.path.join(path, name))
)

# Write one "filename, number_of_rows" entry per label file.
with open('exp7_labels.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row.
    writer.writerow(['filename', 'number_of_rows'])
    for file in files:
        # Count lines lazily instead of loading the whole file with readlines().
        with open(os.path.join(path, file)) as f:
            rows = sum(1 for _ in f)
        writer.writerow([file, rows])

# Read the CSV back and print every row as a sanity check.
# newline='' is what the csv module recommends for files it reads or writes.
with open('exp7_labels.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

['filename', 'number_of_rows']
['new_building_1.txt', '2']
['new_building_10.txt', '2']
['new_building_100.txt', '3']
['new_building_101.txt', '4']
['new_building_102.txt', '3']
['new_building_103.txt', '7']
['new_building_104.txt', '7']
['new_building_105.txt', '2']
['new_building_106.txt', '1']
['new_building_107.txt', '1']
['new_building_109.txt', '3']
['new_building_11.txt', '1']
['new_building_111.txt', '1']
['new_building_112.txt', '5']
['new_building_114.txt', '2']
['new_building_116.txt', '2']
['new_building_117.txt', '1']
['new_building_118.txt', '3']
['new_building_119.txt', '1']
['new_building_12.txt', '1']
['new_building_120.txt', '1']
['new_building_121.txt', '1']
['new_building_125.txt', '8']
['new_building_127.txt', '3']
['new_building_128.txt', '6']
['new_building_13.txt', '2']
['new_building_130.txt', '7']
['new_building_131.txt', '3']
['new_building_132.txt', '1']
['new_building_133.txt', '8']
['new_building_134.txt', '2']
['new_building_135.txt', '2']
['new_building_

In [24]:
# Load exp7_labels.csv, the per-file row counts written by the cell above.

import pandas as pd
df = pd.read_csv('exp7_labels.csv')
# Preview the first rows (expected columns: 'filename', 'number_of_rows').
df.head()

# Next step: split the file number out of the file name.

Unnamed: 0,filename,number_of_rows
0,new_building_1.txt,2
1,new_building_10.txt,2
2,new_building_100.txt,3
3,new_building_101.txt,4
4,new_building_102.txt,3


In [25]:
# Sort by file name. This is a lexicographic sort ('1', '10', '100', ...);
# the numeric ordering is applied later, once the number has been cast to int.
df = df.sort_values(by=['filename'])

# Pull out the digits between the last underscore and the '.txt' extension,
# e.g. 'new_building_102.txt' -> '102'. Anchoring on '_<digits>.txt' at the end
# of the name is more robust than splitting on '.', which returns the wrong
# piece as soon as a name contains another dot.
# The value is still a string here; names that do not match the pattern get NaN.
df['file_number'] = df['filename'].str.extract(r'_(\d+)\.txt$', expand=False)
df.head()

Unnamed: 0,filename,number_of_rows,file_number
0,new_building_1.txt,2,1
1,new_building_10.txt,2,10
2,new_building_100.txt,3,100
3,new_building_101.txt,4,101
4,new_building_102.txt,3,102


In [26]:
#df['filename'] = df['filename'][1:]

In [27]:
# Cast the extracted file number from string to integer so that ordering is
# numeric (1, 2, ..., 10) instead of lexicographic (1, 10, 100, ...), then
# reorder the rows by that number.
df = df.astype({'file_number': int}).sort_values('file_number')

In [28]:
# Persist the numerically sorted table (including the 'file_number' column),
# overwriting exp7_labels.csv.
df.to_csv('exp7_labels.csv', index=False)
# The preview goes last: a notebook cell only displays its final expression, so
# a df.head() placed before to_csv() is evaluated and silently discarded.
df.head()

In [29]:
# Build the complete table for file numbers 1..167: every expected label file
# gets a row, and a file that has no entry in df gets a row count of 0.

# Expected file numbers, 1 to 167 inclusive.
file_numbers = list(range(1, 168))

# Scaffold frame with ONLY the join key. If it also carried 'filename' and
# 'number_of_rows' columns, the merge would emit suffixed duplicates
# (filename_x / filename_y, number_of_rows_x / number_of_rows_y) and the saved
# CSV would no longer have the two expected columns.
df_new = pd.DataFrame({'file_number': file_numbers})

# Left merge keeps every expected file number; numbers without a label file
# come out with NaN in 'filename' and 'number_of_rows'.
df_merge = pd.merge(
    df_new,
    df[['file_number', 'filename', 'number_of_rows']],
    how='left',
    on='file_number',
)
# A duplicated file number in df would silently add rows to the result.
assert len(df_merge) == len(file_numbers), 'duplicate file numbers in df'

# A missing label file means zero rows. The NaN introduced by the merge forces
# the column to float, so cast it back to int after filling.
df_merge['number_of_rows'] = df_merge['number_of_rows'].fillna(0).astype(int)

# Give missing files their real name instead of a 0 placeholder, so each such
# row reads '<prefix>_<number>.txt,0'. The prefix is taken from the first file
# name that does exist (everything before its last underscore).
df_merge['filename'] = df_merge['filename'].astype(object)
missing = df_merge['filename'].isna()
existing_names = df_merge.loc[~missing, 'filename']
prefix = existing_names.iloc[0].rsplit('_', 1)[0] + '_' if not existing_names.empty else ''
df_merge.loc[missing, 'filename'] = (
    prefix + df_merge.loc[missing, 'file_number'].astype(str) + '.txt'
)

# The join key is no longer needed; the CSV keeps 'filename' and 'number_of_rows'.
df_merge = df_merge.drop(columns=['file_number'])

# save to csv
df_merge.to_csv('exp7_labels.csv', index=False)


In [18]:
# Load gpr_file.csv. header=None tells pandas not to treat the first line as
# column names, so the columns are labelled with integers starting at 0.
# NOTE: this rebinds `df`, replacing the label-count table used in the cells above.
import pandas as pd
import numpy as np
df = pd.read_csv('gpr_file.csv', header=None)

In [19]:
# Preview the first rows of the table loaded from gpr_file.csv.
df.head()

Unnamed: 0,0,1,2,3,4,5
0,1,1,1,0.0,"04/24/2023, 19:22:45",2
1,2,1,0,100.0,"04/24/2023, 19:23:45",2
2,2,0,0,0.0,"04/24/2023, 19:24:45",0
3,4,2,2,0.0,"04/24/2023, 19:25:45",0
4,7,3,1,66.666667,"04/24/2023, 19:26:45",0


In [30]:
# Per-row absolute percentage error between column 1 and column 2, with
# column 2 as the reference value (it is the denominator):
#   error = abs((col1 - col2) / col2)
# The value is stored as a fraction; it is NOT multiplied by 100.
# Rows where column 2 is 0 would divide by zero, so their error is set to 0.
# NOTE(review): scoring those rows as 0 counts them as perfect matches when the
# column is averaged — confirm this is intended.
df['MAPE'] = ((df[1] - df[2]) / df[2]).abs().where(df[2] != 0, 0)

In [32]:
# Average the per-row error over all rows. The result is a fraction (not a
# percentage), and rows whose column 2 is 0 contribute an error of 0.
df['MAPE'].mean()

0.2680115958559073

In [29]:
# Two-sample t-test comparing the means of column 1 and column 2.
# NOTE(review): ttest_ind treats the two columns as independent samples, but the
# MAPE cell compares them row by row. If each row is a matched pair, a paired
# test (stats.ttest_rel) may be the appropriate choice — confirm.
from scipy import stats
stats.ttest_ind(df[1], df[2])

Ttest_indResult(statistic=1.9284655016054222, pvalue=0.05464965429793361)

In [34]:
# Repeat the MAPE computation on a second data set.

# Load exp2_labels.csv (expected columns: 'filename', 'number_of_rows').
# NOTE: this rebinds `df`, replacing the table loaded from gpr_file.csv.
df = pd.read_csv('exp2_labels.csv')
df.head()

Unnamed: 0,filename,number_of_rows
0,3-27-2023_14-54-34_1.txt,7
1,3-27-2023_14-54-34_10.txt,4
2,3-27-2023_14-54-34_11.txt,2
3,3-27-2023_14-54-34_12.txt,1
4,3-27-2023_14-54-34_13.txt,2


In [35]:
# Reduce each file name to its trailing number, e.g.
# '3-27-2023_14-54-34_10.txt' -> '10'.
# regex=True must be passed explicitly: both patterns are regular expressions,
# and since pandas 2.0 str.replace() defaults to regex=False, which would treat
# them as literal text and leave the names unchanged (older versions emit a
# FutureWarning about this default instead).
# Step 1: drop the '.txt' extension.
df['filename'] = df['filename'].str.replace(r'\.txt$', '', regex=True)
# Step 2: drop the leading '<date>_<time>_' prefix.
df['filename'] = df['filename'].str.replace(r'^\d+-\d+-\d+_\d+-\d+-\d+_', '', regex=True)

  df['filename'] = df['filename'].str.replace(r'\.txt$', '')
  df['filename'] = df['filename'].str.replace(r'^\d+-\d+-\d+_\d+-\d+-\d+_','')


In [36]:
# Check that the file names were reduced to their bare numbers.
df.head()

Unnamed: 0,filename,number_of_rows
0,1,7
1,10,4
2,11,2
3,12,1
4,13,2


In [39]:
# Add an integer 'filenum' column parsed from the stripped file name (now just
# digits) and order the rows numerically by it.
df = df.assign(filenum=df['filename'].astype(int)).sort_values('filenum')

In [50]:
# Load converted1_gpr.csv. header=None tells pandas not to treat the first line
# as column names, so the columns are labelled with integers starting at 0.
df_gpr = pd.read_csv('converted1_gpr.csv', header=None)

In [52]:
# Display the label-count table, now sorted by 'filenum'.
df

Unnamed: 0,filename,number_of_rows,filenum
0,1,7,1
10,2,6,2
21,3,5,3
36,5,5,5
40,6,2,6
45,7,3,7
48,8,1,8
49,9,4,9
1,10,4,10
2,11,2,11


In [51]:
# Preview the first rows of the table loaded from converted1_gpr.csv.
df_gpr.head()

Unnamed: 0,0,1,2,3,4
0,64,64,39.0,39.0625,"04/24/2023, 18:30:12"
1,120,56,23.0,58.928571,"04/24/2023, 18:31:12"
2,183,63,41.0,34.920635,"04/24/2023, 18:32:12"
3,240,57,25.0,56.140351,"04/24/2023, 18:33:12"
4,307,67,45.0,32.835821,"04/24/2023, 18:34:12"
