# In [None]:
import pandas as pd
import numpy as np
import chardet
import re
import statistics
import math
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import t, binom, norm, ks_2samp, ttest_ind, permutation_test
import random
from docx import Document as WordDocument
import pickle
import warnings
warnings.filterwarnings("ignore")


class Python_Scroll:
    def __init__(self, file_path):
        ext = file_path.lower().split('.')[-1]

        if ext == 'txt':
            self.data = self.extract_text_from_txt(file_path)
        elif ext == 'docx':
            docx_text = self.extract_text_from_docx(file_path)
            self.data = pd.DataFrame({'Text': docx_text.split('\n')})
        else:
            raise ValueError("Unsupported file format")

    def extract_text_from_txt(self, file_path):
        """Read a tab-separated text file into a DataFrame using a detected encoding.

        The raw bytes are passed to ``chardet.detect`` and the reported encoding
        is handed to ``pandas.read_csv``. The file is parsed without a header row.
        """
        with open(file_path, 'rb') as raw_file:
            detection = chardet.detect(raw_file.read())
        detected_encoding = detection['encoding']
        return pd.read_csv(file_path, header=None, encoding=detected_encoding, delimiter='\t')

    def extract_text_from_docx(self, docx_file):
        """Return the text of every paragraph of a Word document, joined by newlines."""
        document = WordDocument(docx_file)
        return '\n'.join([paragraph.text for paragraph in document.paragraphs])

#####################
### Preprocessing ###
#####################

# Data cleaning
    def clean_data(self, save=0):
        """Strip editorial markup from every cell and index rows by their "N:M" marker.

        Side effects:
            * ``self.original_data`` receives a copy of the table taken after the
              bracket/mark removal but before ``^...^`` spans are removed.
            * ``self.data`` ends up with a two-level MultiIndex built from the first
              "digits:digits" marker found in the last column, with that marker
              removed from the text and the columns renumbered from 0.

        Args:
            save: when equal to 1, ``save_to_word`` is called at the end.
        """
        # Helper: whenever the mark '֯' or the character '\u200e' is met, drop it
        # together with the character accumulated just before it.
        def remove_big_dots_and_tab(text):
            cleaned_text = ''
            for char in text:
                if char == '֯' or char == '\u200e':
                    cleaned_text = cleaned_text[:-1]
                    continue
                else:
                    cleaned_text += char
            return cleaned_text

        # Apply the helper to every cell (cells must be iterable strings here).
        self.data = self.data.applymap(remove_big_dots_and_tab)
        
        # Remove punctuation and bracketed editorial content.
        self.data = self.data.applymap(lambda x: x.replace('?', ''))  # Remove '?'
        self.data = self.data.applymap(lambda x: x.replace('.', ''))  # Remove '.'
        self.data = self.data.applymap(lambda x: re.sub(r'\{{.*?\}}', '', str(x)))  # Remove {{***}}
        self.data = self.data.applymap(lambda x: re.sub(r'\⟦.*?\⟧', '', str(x))) # remove ⟦***⟧
        self.data = self.data.applymap(lambda x: re.sub(r'\[.*?\]', '', str(x))) # Remove [***]

        # Remove the U+05C4 mark (little dot above a letter), keeping the letter itself
        self.data = self.data.applymap(lambda x: re.sub(r'\u05C4', '', str(x)))

        #self.data = self.data.applymap(lambda x: x.replace('\xa0', ''))  # Remove '\xa0'
        # Second pass with the same [***] pattern as above.
        self.data = self.data.applymap(lambda x: re.sub(r'\[.*?\]', '', str(x)))
        
        # Snapshot that still contains the ^***^ spans; count_corrections reads it.
        self.original_data = self.data.copy() # create a copy for corrections computation
        self.data = self.data.applymap(lambda x: re.sub(r'\^.*?\^', '', str(x))) # Remove ^***^

        # Extract the two numbers of the first "digits:digits" marker in the last column
        index_cols = self.data.iloc[:, -1].str.extract(r'(\d+):(\d+)', expand=True)
        new_index_1 = index_cols[0]
        new_index_2 = index_cols[1]

        # Remove every "digits:digits" marker from the last column and trim whitespace
        self.data.iloc[:, -1] = self.data.iloc[:, -1].str.replace(r'\d+:\d+', '', regex=True).str.strip()

        # Reset the index to integer values
        self.data = self.data.reset_index(drop=True)

        # Use the two extracted numbers (kept as strings) as a two-level index
        self.data.index = pd.MultiIndex.from_arrays([new_index_1, new_index_2])

        # Reset the column names
        self.data.columns = range(self.data.shape[1])

        # Optionally write the cleaned table to a Word document
        if save == 1:    
            self.save_to_word()
            
    def save_to_word(self):
        """Write every cell of ``self.data`` to ``Cleaned_data.docx``, one paragraph per cell.

        Cells are visited row by row; missing values produce an empty paragraph.
        """
        document = WordDocument()

        n_rows = self.data.shape[0]
        for row_position in range(n_rows):
            for cell in self.data.iloc[row_position].tolist():
                paragraph_text = str(cell) if pd.notna(cell) else ""
                document.add_paragraph(paragraph_text)

        document.save("Cleaned_data.docx")

    def load_data(self, path):
        self.t_data = pd.read_csv(path, delimiter='\t', header=None, encoding='latin-1')
        level_labels = ['Level1'] * len(self.data) + ['Level2'] * len(self.data)
        self.t_data.index = pd.MultiIndex.from_tuples(zip(level_labels, self.data.index))
        self.data = self.t_data

    def __str__(self):
        return str(self.data)

# Create a new table counting the appearances of each word in plene (full) and defective spelling

    def word_counts(self, word_triplets_list):
        # Initialize a new table to store the word counts
        self.word_counts_table = pd.DataFrame()

        for i, word_triplet in enumerate(word_triplets_list):
            max_chars = 0
            words = word_triplet[0]
            max_chars = word_triplet[1] if len(word_triplet) > 1 else 0
            allowed_chars = word_triplet[2] if len(word_triplet) > 2 else ()
            word1, word2 = words

            word1_column = f'{word1}'
            word2_column = f'{word2}'

            word1_counts = pd.DataFrame(columns=['col_1'], index=range(len(self.data.index.levels[0])))
            word2_counts = pd.DataFrame(columns=['col_2'], index=range(len(self.data.index.levels[0])))

            # Access data at the specified level using .xs
            for high_index in self.data.index.levels[0]:
                values = self.data.xs(high_index, level=0).squeeze().tolist()

                defective_spelling_count = 0
                plane_spelling_count = 0

                for value in values:
                    # Split value into words
                    values_words = [word.strip() for word in value.split()]
                    for i, value_word in enumerate(values_words):
                        value_word = value_word.replace('\u200F', '')
                        len_value_word = len(value_word)
                        if value_word.endswith(word1):
                            if len_value_word == len(word1):
                                defective_spelling_count += 1
                            # the checking word is larger but in a possible length
                            elif (len_value_word - len(word1) <= max_chars) and (len_value_word - len(word1) > 0):
                                if allowed_chars != ():
                                    if value_word.startswith(allowed_chars):
                                        defective_spelling_count += 1
                                else: # allowed_chars not specified, then all chars are premitted
                                    defective_spelling_count += 1
                                    
                        elif value_word.endswith(word2):
                            if len_value_word == len(word2):
                                plane_spelling_count += 1
                            # the checking word is larger but in a possible length
                            elif (len_value_word - len(word2) <= max_chars) and (len_value_word - len(word2) > 0):
                                if allowed_chars != ():
                                    if value_word.startswith(allowed_chars):
                                        plane_spelling_count += 1
                                else: # allowed_chars not specified, then all chars are premitted
                                    plane_spelling_count += 1
                                    
                word1_counts.iat[int(high_index) - 1, 0] = defective_spelling_count
                word2_counts.iat[int(high_index) - 1, 0] = plane_spelling_count

            self.word_counts_table[word1_column] = word1_counts
            self.word_counts_table[word2_column] = word2_counts
            
# Concatenate the columns of an Excel sheet to the word-count table (self.word_counts_table)
    def join_dataframes(self, file_path):
        """Append the columns of an Excel sheet to ``self.word_counts_table``.

        The sheet at ``file_path`` is read with ``pandas.read_excel`` and
        concatenated column-wise (``axis=1``) to the existing table.
        """
        extra_columns = pd.read_excel(file_path)
        frames = [self.word_counts_table, extra_columns]
        self.word_counts_table = pd.concat(frames, axis=1)

# Create a table for the statistical test results
    def create_tests_table(self, var_test=False, row_test=False, count_corrections=False):
        # checking how many more columns to add (var and num of row tests)
        boolean_vars = var_test + row_test + count_corrections
        length = (len(self.word_counts_table.columns) // 2 + boolean_vars)
        self.table = pd.DataFrame(columns=range(length))


#####################
# Statistical tests #
#####################  
    def TestDifference(self, start_1, stop_1, start_2, stop_2, test_name ='', p_thresh = 0.1, char_corrections='^'):
        """Run all tests comparing two row ranges and count results below ``p_thresh``.

        For every consecutive pair of columns in ``self.word_counts_table`` a
        Wald test and a permutation version of it are run; then the row-variance,
        number-of-rows and corrections tests are run on single-column tables.
        Intermediate results are printed (the corrections test prints only its
        heading).

        Args:
            start_1, stop_1: bounds of the first row range.
            start_2, stop_2: bounds of the second row range.
            test_name: label printed before the results.
            p_thresh: threshold a value must be strictly below to count as a success.
            char_corrections: character passed to ``count_corrections``.

        Returns:
            Tuple ``(n_success_t, n_success_p)``: how many parametric results and
            how many permutation results were below ``p_thresh``.
        """
        print(test_name)
        #p_thresh = p_thresh/2
        n_success_t = 0
        n_success_p = 0
        self.rellavent_data = self.word_counts_table
        columns = self.rellavent_data.columns[:]
        # Columns are consumed two at a time: (columns[i], columns[i+1]).
        for i in range(0, len(self.word_counts_table.columns) , 2):
            col_1, col_2 = columns[i], columns[i+1]
            # Despite its name, t_value is the value returned by WaldTest, i.e. a
            # CDF value already passed through update_p_value.
            t_value = self.WaldTest(col_1, col_2, start_1, stop_1, start_2, stop_2)
            # Falsy results ('' when the test is undefined, or exactly 0) are skipped.
            if t_value:
                p_value = self.PermuteWald(col_1, col_2, start_1, stop_1, start_2, stop_2, t_value, n_iter = 1000)
                print("Column Names:", col_1, col_2)
                print("t_value: ", t_value)
                print("p_value: ", p_value)
                if t_value < p_thresh:
                    n_success_t += 1
                if p_value:
                    if p_value < p_thresh:
                        n_success_p += 1
        
        # Row-variance test on self.var_table (built by compute_row_var).
        print("Row Variance Test:")
        self.compute_row_var()
        t_value = self.student_t_test(self.var_table, start_1, stop_1, start_2, stop_2)
        p_value = self.permutation_1_col(self.var_table, start_1, stop_1, start_2, stop_2)
        print("t_value: ", t_value)
        print("p_value: ", p_value)
        if t_value:
            if t_value < p_thresh:
                n_success_t += 1
        if p_value:
            if p_value < p_thresh:
                n_success_p += 1

        # Number-of-rows test on self.num_of_rows_column.
        print("Number of Rows Test:")
        self.compute_column_num_of_rows()
        t_value = self.student_t_test(self.num_of_rows_column, start_1, stop_1, start_2, stop_2)
        p_value = self.permutation_1_col(self.num_of_rows_column, start_1, stop_1, start_2, stop_2)
        print("t_value: ", t_value)
        print("p_value: ", p_value)
        if t_value:
            if t_value < p_thresh:
                n_success_t += 1
        if p_value:
            if p_value < p_thresh:
                n_success_p += 1

        # Corrections test on self.corrections; unlike the tests above, the
        # resulting values are not printed.
        print("Number of Corrections Test:")
        self.count_corrections(char_corrections)
        t_value = self.student_t_test(self.corrections, start_1, stop_1, start_2, stop_2)
        p_value = self.permutation_1_col(self.corrections, start_1, stop_1, start_2, stop_2)
        if t_value:
            if t_value < p_thresh:
                n_success_t += 1
        if p_value:
            if p_value < p_thresh:
                n_success_p += 1
        
        return n_success_t, n_success_p
    
    def PermuteTestDifference(self, null_range, start, stop, n_iter = 100, test_name ='', p_thresh = 0.1, char_corrections='^'):
        n_sucesses = np.zeros((2, n_iter))
        for j in range(n_iter):
            if j%100 == 0:
                print(f"Wald permutation test num: {j}")
                continue
            permuted_indices = np.random.permutation(null_range)
            self.rellavent_data = self.word_counts_table.iloc[permuted_indices]
        return
    
        
    def WaldTest_WithData(self, col_1_data_1, col_2_data_1, col_1_data_2, col_2_data_2):
        
        n_obs_1 = col_1_data_1.sum() + col_2_data_1.sum()
        n_obs_2 = col_1_data_2.sum() + col_2_data_2.sum()
        
        if n_obs_1 == 0 or n_obs_2 == 0:
            return ''            
        
        p_1 = col_1_data_1.sum()/n_obs_1
        p_2 = col_1_data_2.sum()/n_obs_2
        p_h = (n_obs_1*p_1 + n_obs_2*p_2) / (n_obs_1+n_obs_2)

        sd_h = math.sqrt(p_h*(1-p_h)*(1/n_obs_1 + 1/n_obs_2))
        if sd_h == 0:
            return ''
        
        degrees_of_freedom = n_obs_1 + n_obs_2 -2
        if degrees_of_freedom + 2 >= 30:
            return self.update_p_value(norm.cdf((p_2 - p_1) / sd_h))
        else:
            return self.update_p_value(t.cdf((p_2 - p_1) / sd_h, degrees_of_freedom))
        
    def WaldTest(self, col_1, col_2, start_1, stop_1, start_2, stop_2):
        col_1_data_1 = self.read_col(col_1, start_1, stop_1)
        col_2_data_1 = self.read_col(col_2, start_1, stop_1)
        col_1_data_2 = self.read_col(col_1, start_2, stop_2)
        col_2_data_2 = self.read_col(col_2, start_2, stop_2)
        return self.WaldTest_WithData(col_1_data_1, col_2_data_1, col_1_data_2, col_2_data_2)


    def PermuteWald(self, col_1, col_2, start_1, stop_1, start_2, stop_2, t_value, n_iter = 1000):
        tests_results = []
        indexes_to_shuffle = np.concatenate((np.arange(start_1-1, stop_1), np.arange(start_2-1, stop_2)) )
        n_obs_1 = stop_1 - start_1
        #n_obs_2 = stop_2 - start_2
        for j in range(n_iter):
            if j%100 == 0:
                #print(f"Wald permutation test num: {j}")
                continue
            permuted_indices = np.random.permutation(indexes_to_shuffle)
            col_1_data = self.read_col_at_indices(col_1, permuted_indices)
            col_2_data = self.read_col_at_indices(col_2, permuted_indices)
            col_1_data_1 = col_1_data[:n_obs_1+1]
            col_2_data_1 = col_2_data[:n_obs_1+1]
            col_1_data_2 = col_1_data[n_obs_1+1:]
            col_2_data_2 = col_2_data[n_obs_1+1:]
            statistic_value = self.WaldTest_WithData(col_1_data_1, col_2_data_1, col_1_data_2, col_2_data_2)
            if statistic_value:
                tests_results.append(statistic_value)

        self.rellavent_data = self.word_counts_table
        if len(tests_results)==0:
            return ''     
        return sum(1 for num in tests_results if num <= t_value) / len(tests_results)

    def PermuteWald_WithData(self, col_1_data, col_2_data, start_1, stop_1, start_2, stop_2, t_value, n_iter = 1000):
        tests_results = []
        indexes_to_shuffle = np.concatenate((np.arange(start_1-1, stop_1), np.arange(start_2-1, stop_2)) )
        n_obs_1 = stop_1 - start_1
        #n_obs_2 = stop_2 - start_2
        for j in range(n_iter):
            if j%100 == 0:
                #print(f"Wald permutation test num: {j}")
                continue
            permuted_indices = np.random.permutation(indexes_to_shuffle)
            col_1_data_shuffled = col_1_data[permuted_indices]
            col_2_data_shuffled = col_2_data[permuted_indices]
            col_1_data_1 = col_1_data_shuffled[:n_obs_1+1]
            col_2_data_1 = col_2_data_shuffled[:n_obs_1+1]
            col_1_data_2 = col_1_data_shuffled[n_obs_1+1:]
            col_2_data_2 = col_2_data_shuffled[n_obs_1+1:]
            statistic_value = self.WaldTest_WithData(col_1_data_1, col_2_data_1, col_1_data_2, col_2_data_2)
            if statistic_value:
                tests_results.append(statistic_value)

        self.rellavent_data = self.word_counts_table
        if len(tests_results)==0:
            return ''     
        return sum(1 for num in tests_results if num <= t_value) / len(tests_results)    
        
# Calculation of the statistics for the test
    def orgenizing_statistical_tests(self, start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, test_name ='', var_test=False, row_test=False, count_corrections=False, char_corrections=''):
        self.rellavent_data = self.word_counts_table
        self.statistical_tests(start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, test_name, var_test, row_test, count_corrections, char_corrections)
        self.export_table_statistical_test()
        
    def statistical_tests(self, start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, test_name, var_test, row_test, count_corrections, char_corrections):
        columns = self.rellavent_data.columns[:]  # Exclude the first column
        data = [np.nan] * len(self.table.columns)
        new_row = pd.Series(data, index=self.table.columns, name=test_name)
        #self.table = self.table.append(new_row)
        self.table = pd.concat([self.table, pd.DataFrame([new_row])], ignore_index=True)

        for i in range(0, len(self.word_counts_table.columns) , 2):
            col_1, col_2 = columns[i], columns[i+1]
            if type_of_test == 't_test':
                p_value = self.t_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0)
            elif type_of_test == 'permutation':
                t_value = self.t_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0)
                p_value = self.hand_permutation_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0, t_value)
            else: 
                raise('An incorrect test type was entered in the function')

            p_value = self.update_p_value(p_value)
            #print(p_value)
            self.table.iloc[-1, i//2] = p_value
            self.table.rename(columns={self.table.columns[i//2]: col_1}, inplace=True)
    
# Additional tests if chekcing row's var or/and num of rows tests
        if var_test == True:
            self.compute_row_var()
            if type_of_test == 't_test': 
                p_value = self.student_t_test(self.var_table, start_row, stop_row, start_writer_h0, end_writer_h0)
            elif type_of_test == 'permutation':
                p_value = self.permutation_1_col(self.var_table, start_row, stop_row, start_writer_h0, end_writer_h0)
            p_value = self.update_p_value(p_value)
            self.table.iloc[-1, i//2 + var_test] = p_value
            self.table.rename(columns={self.table.columns[i//2 +var_test]: 'row_variance'}, inplace=True)

        if row_test == True:
            self.compute_column_num_of_rows()
            if type_of_test == 't_test': 
                p_value = self.student_t_test(self.num_of_rows_column, start_row, stop_row, start_writer_h0, end_writer_h0)
            elif type_of_test == 'permutation':
                p_value = self.permutation_1_col(self.num_of_rows_column, start_row, stop_row, start_writer_h0, end_writer_h0)
            p_value = self.update_p_value(p_value)
            self.table.iloc[-1, i//2 + var_test + row_test] = p_value
            self.table.rename(columns={self.table.columns[i//2 + var_test + row_test]: 'num_of_rows'}, inplace=True)

        if count_corrections == True:
            self.count_corrections(char_corrections)
            if type_of_test == 't_test': 
                p_value = self.student_t_test(self.corrections, start_row, stop_row, start_writer_h0, end_writer_h0)
            elif type_of_test == 'permutation':
                p_value = self.permutation_1_col(self.corrections, start_row, stop_row, start_writer_h0, end_writer_h0)
            p_value = self.update_p_value(p_value)
            self.table.iloc[-1, i//2 + var_test + row_test + count_corrections] = p_value
            self.table.rename(columns={self.table.columns[i//2 + var_test + row_test + count_corrections]: 'overall corrections'}, inplace=True)

    def read_col(self, col, start, stop):
        return self.rellavent_data[col].iloc[start:stop+1]
    def read_col_at_indices(self, col, indices):
        return self.rellavent_data[col].iloc[indices]
# Compute the probability of receiving the empirical results under a normal
    # approximation (the implementation below uses norm.cdf, not a T distribution)
    def t_test(self, col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0):
        col_1_data_h0 = self.read_col(col_1, start_writer_h0, end_writer_h0)
        col_2_data_h0 = self.read_col(col_2, start_writer_h0, end_writer_h0)
        col_1_data = self.read_col(col_1, start_row, stop_row)
        col_2_data = self.read_col(col_2, start_row, stop_row)
        
        n_obs_h0 = col_1_data_h0.sum() + col_2_data_h0.sum()
        n_obs = col_1_data.sum() + col_2_data.sum()
        
        # p_h = self.calculate_mean(col_1, col_2, start_row, stop_row)
        # p_h0 = self.calculate_mean(col_1, col_2, start_writer_h0, end_writer_h0)
        # #var_0 = self.empirical_var(col_1, col_2, start_writer_h0, end_writer_h0)
        # if start_writer_h0 == start_row:
        #     p_h1 = self.calculate_mean(col_1, col_2, end_writer_h0+1, stop_row)
        #     #var_1 = self.empirical_var(col_1, col_2, end_writer_h0+1, stop_row)
        # else:
        #     p_h1 = self.calculate_mean(col_1, col_2, start_row, start_writer_h0-1)
        #     #var_1 = self.empirical_var(col_1, col_2, start_row, start_writer_h0-1)
        # # if var not defined then return '' 
        
        # num_obs_h0 = self.calculate_sum(col_1, col_2, start_writer_h0, end_writer_h0)
        # num_obs_h1 = self.calculate_sum(col_1, col_2, start_row, stop_row) - num_obs_h0
        if n_obs_h0 == 0 or n_obs == 0:
            return ''            
        
        p_h0 = col_1_data_h0.sum()/n_obs_h0
        p = col_1_data.sum()/n_obs
        
        sd_h = math.sqrt(p*(1-p)*(1/n_obs_h0 + 1/n_obs))
        if sd_h == 0:
            return ''
        
        # joined_sd = math.sqrt(((num_obs_h0 - 1)*var_0 + (num_obs_h1 - 1)*var_1) / 
        #             (num_obs_h0 + num_obs_h1 - 2))
        # joined_sd_for_means = joined_sd * math.sqrt(1/num_obs_h0 + 1/num_obs_h1)
        
        # degrees_of_freedom = self.calculate_sum(col_1, col_2, start_row, stop_row)-2
        return norm.cdf((p - p_h0) / sd_h)

    
    # compute permutation distribution
    def hand_permutation_test(self, col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0, t_value):
        """Permutation version of ``t_test`` for one column pair.

        In each of ``NUM_PERMUTATIONS`` rounds the rows at positions
        ``start_row-1 .. stop_row-1`` of the two columns are replaced by a
        shuffled selection of the same rows and ``t_test`` is recomputed on the
        shuffled copy.

        NOTE(review): ``NUM_PERMUTATIONS`` is not defined in the visible part of
        this file; presumably a module-level constant, confirm it exists.

        Returns:
            Fraction of truthy permuted results that are ``<= t_value``, or
            ``''`` when no round produced a truthy result.
        """
        tests_results = []
        for j in range(NUM_PERMUTATIONS):
            #print(f"Hypothesis permutatuin test num: {j}")
            # Work on a fresh two-column copy so the original counts stay untouched.
            self.rellavent_data = self.word_counts_table.copy()            
            self.rellavent_data = self.rellavent_data.loc[:, [col_1, col_2]]
            
            indexes_to_shuffle = np.arange(start_row-1, stop_row)
            np.random.shuffle(indexes_to_shuffle)
            # The right-hand side selects rows by label, so this assumes the row
            # labels of word_counts_table equal their positions - TODO confirm.
            self.rellavent_data.iloc[start_row-1:stop_row, :] = self.word_counts_table.loc[indexes_to_shuffle, [col_1, col_2]]
            p_value = self.t_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0)
            # Falsy results ('' from t_test, or exactly 0) are not recorded.
            if p_value:
                tests_results.append(p_value)

        # Point rellavent_data back at the unshuffled table.
        self.rellavent_data = self.word_counts_table
        if len(tests_results)==0:
            return ''     
        return sum(1 for num in tests_results if num <= t_value) / len(tests_results)
    
# calculate p = the empirical parameter of the Bernoulli distribution
    def calculate_mean(self, col_1, col_2, start_writer_h0, end_writer_h0):
        col_1_data = self.rellavent_data[col_1].iloc[start_writer_h0-1:end_writer_h0].tolist()
        col_2_data = self.rellavent_data[col_2].iloc[start_writer_h0-1:end_writer_h0].tolist()
        if self.calculate_sum(col_1, col_2, start_writer_h0, end_writer_h0) != 0:
            return sum(col_1_data) / self.calculate_sum(col_1, col_2, start_writer_h0, end_writer_h0)
        else:
            return 'division by zero'

# calculate N = the number of appearances of the relevant word in both spellings within the row range
    def calculate_sum(self, col_1, col_2, start_writer_h0, end_writer_h0):
        col_1_data = self.rellavent_data[col_1].iloc[start_writer_h0-1:end_writer_h0].tolist()
        col_2_data = self.rellavent_data[col_2].iloc[start_writer_h0-1:end_writer_h0].tolist()
        return sum(col_1_data) + sum(col_2_data)

# calculate N_1 = the number of appearances of the relevant word in one spelling only
    def calculate_success(self, col_1, start_writer_h0, end_writer_h0):
        col_1_data = self.rellavent_data[col_1].iloc[start_writer_h0-1:end_writer_h0].tolist()
        return sum(col_1_data)

# Calculate the empirical variance for the given columns
    def empirical_var(self, col_1, col_2, start_writer_h0, end_writer_h0):
        if self.calculate_sum(col_1, col_2, start_writer_h0, end_writer_h0)-1 == 0 or self.calculate_mean(col_1, col_2, start_writer_h0, end_writer_h0)=='division by zero':
            return ''
        meanning_the_sum_of_squers = 1 / (self.calculate_sum(col_1, col_2, start_writer_h0, end_writer_h0)-1)
        sum_of_squers_for_success_columns = sum([i*(1 - self.calculate_mean(col_1, col_2, start_writer_h0, end_writer_h0)) ** 2 for i in self.rellavent_data[col_1].iloc[start_writer_h0 - 1 : end_writer_h0].tolist()])                     
        sum_of_squers_for_unsuccess_columns = sum([i*(0 - self.calculate_mean(col_1, col_2, start_writer_h0, end_writer_h0)) ** 2 for i in self.rellavent_data[col_2].iloc[start_writer_h0 - 1 : end_writer_h0].tolist()])                     
        var = meanning_the_sum_of_squers*(sum_of_squers_for_success_columns+sum_of_squers_for_unsuccess_columns)
        return var
########################
# QUESTION : WHY DO YOU DO THAT?
# update p_value
    def update_p_value(self, p_value):
        if p_value == '':
            return ''
        elif p_value> 0.5:
            return 1-p_value
        return p_value
    
    def export_table_statistical_test(self):
        self.table.to_csv("Updated_Significance_Tests_Table_11.csv")

# Filling up the tables
    def create_table_permutation_tests(self, length=20):
        row_names = range(length + 1)
        self.permutation_table = pd.DataFrame(index=range(length + 1))
            
    def update_table_with_counts_proportion(self, test_name):
        # add an empty row to self.table
        data = [np.nan] * len(self.permutation_table.index)
        new_column = pd.Series(data, index=self.permutation_table.index, name=test_name)
        self.permutation_table = pd.concat([self.permutation_table, new_column], axis=1)
        
        # Count the occurrences of each number in self.tests_results
        counts = {i: self.tests_results.count(i) for i in range(len(self.permutation_table.index))}

        list_length = len(self.tests_results)

        # Update the values in the specified row of the table
        for row_idx, count in counts.items():
            self.permutation_table.at[row_idx, test_name] = count / list_length

    def export_table_permutaion_tests(self):
        self.table.to_csv("Updated_Significance_Tests_Table_11.csv")

######################
## row's length var ##
######################

# Functions for computing num of words and letters in a row
    def replace_with_letter_count(self):
        # Create a new DataFrame with the same shape as the original table
        new_data = pd.DataFrame(index=self.data.index, columns=self.data.columns)

        # Function to count letters in a string (excluding spaces)
        def count_letters(text):
            return len(re.sub(r'\s', '', str(text)))

        # Iterate over each cell in the original table
        for row_idx, row in self.data.iterrows():
            for col_idx, cell in row.items():
                # Count the letters in the cell (excluding spaces)
                letter_count = count_letters(cell)
                # Assign the letter count to the corresponding cell in the new table
                new_data.loc[row_idx, col_idx] = letter_count

        self.letters_rows_count = new_data

    def compute_row_var(self):
        # Calculate the mean and standard deviation for each group
        self.replace_with_letter_count()
        group_stats = self.letters_rows_count.groupby(level=0).agg(['var'])
        group_stats.index = group_stats.index.astype(int)
        
        # Sort the DataFrame based on the first index values
        self.var_table = group_stats.sort_index()

#####################
## rows in columns ##
#####################
    def compute_column_num_of_rows(self):
        """Store in ``self.num_of_rows_column`` the number of lines per scroll column.

        Lines are counted per first-level index value of the letter-count table,
        the group labels are converted to integers and the result is sorted by
        them. Finally a value is written for the last group: the mean of the
        first column of the result.
        """
        # Refresh self.letters_rows_count, then count the non-missing cells in each group
        self.replace_with_letter_count()
        group_stats = self.letters_rows_count.groupby(level=0).agg(['count'])
        group_stats.index = group_stats.index.astype(int)
        
        # Sort the DataFrame based on the first index values
        self.num_of_rows_column = group_stats.sort_index()
        # NOTE(review): presumably this replaces the last column's line count with
        # the average count (e.g. because the last column is incomplete); the
        # target is addressed with column key 0 - confirm it hits the count column.
        self.num_of_rows_column.at[self.num_of_rows_column.index[-1],0] = self.num_of_rows_column[self.num_of_rows_column.columns[0]].mean()

#####################
# count corrections #
#####################
    def count_corrections(self, char_corrections):
        
        # Apply the clean_text function to each element in the DataFrame
        self.data_t = self.data.copy()
        self.data = self.original_data.copy()
        
        # Create new index columns based on the specified format
        index_cols = self.data.iloc[:, -1].str.extract(r'(\d+):(\d+)', expand=True)
        new_index_1 = index_cols[0]
        new_index_2 = index_cols[1]

        # Remove the "*:*" index from the lines
        self.data.iloc[:, -1] = self.data.iloc[:, -1].str.replace(r'\d+:\d+', '', regex=True).str.strip()

        # Reset the index to integer values
        self.data = self.data.reset_index(drop=True)

        # Set the new indexes to the existing data table
        self.data.index = pd.MultiIndex.from_arrays([new_index_1, new_index_2])

        # Reset the column names
        self.data.columns = range(self.data.shape[1])
        
        self.corrections = self.count_num_of_corrections(char_corrections)
        self.data = self.data_t.copy()
        
    def count_num_of_corrections(self, char_corrections):                                     
        data = pd.DataFrame(columns=self.data.columns, index=range(1,len(self.data.index.levels[0])+1))

        for high_index in self.data.index.levels[0]:
            values = self.data.xs(high_index, level=0).squeeze().tolist()
            corrections = (sum(item.count(char_corrections) for item in values))/2
            data.iat[int(high_index) - 1, 0] = corrections
        return data



#####################
#### test 2 dist ####
#####################
    def student_t_test(self, data_to_test, start_row, stop_row, start_writer_h0, end_writer_h0):
        
        group_0 = data_to_test.iloc[start_writer_h0-1:end_writer_h0, 0].tolist()
        group_1 = data_to_test.iloc[start_row-1:stop_row, 0].tolist()
        # if start_writer_h0 == start_row:
        #     group_1 = data_to_test.iloc[end_writer_h0:stop_row, 0].tolist()
        # else:
        #     group_1 = data_to_test.iloc[start_row-1:start_writer_h0-1, 0].tolist()
        student_t_test_result = ttest_ind(a=group_0, b=group_1, equal_var=True, alternative='less')
        return student_t_test_result.pvalue
    
    def permutation_1_col(self, data_to_test, start_row, stop_row, start_writer_h0, end_writer_h0):
        group_0 = data_to_test.iloc[start_writer_h0-1:end_writer_h0, 0].tolist()
        group_1 = data_to_test.iloc[start_row-1:stop_row, 0].tolist()
        
        #group_1 = (data_to_test.iloc[end_writer_h0:stop_row, 0] if start_writer_h0 == start_row else data_to_test.iloc[start_row-1:start_writer_h0-1, 0]).tolist()

        # Remove empty values from the lists
        group_0 = [val for val in group_0 if not np.isnan(val)]
        group_1 = [val for val in group_1 if not np.isnan(val)]

        # Check if both lists are empty and return np.nan
        if not group_0 or not group_1:
            return np.nan

        if np.sum(group_0 == np.nan) > 0 or np.sum(group_1 == np.nan) > 0:
            print("Something Fishy")
        p_value = permutation_test(data=(group_0, group_1), statistic=self.t_test_for_permutation, permutation_type='independent',
                                   vectorized=True, n_resamples=1000, alternative='less', axis=0)

        # Convert back to np.nan if p_value is zero-dimensional
        if np.isscalar(p_value):
            p_value = np.nan

        return p_value.pvalue

    def t_test_for_permutation(self, x_list, y_list, axis):
        """Statistic callback: p-value of ``ttest_ind`` between two samples.

        ``x_list`` and ``y_list`` are converted to arrays and compared along
        ``axis`` with the default settings of ``ttest_ind``.  When either
        array has length zero along its first dimension, ``np.nan`` is
        returned instead of running the test.
        """
        first_sample, second_sample = np.array(x_list), np.array(y_list)

        if len(first_sample) >= 1 and len(second_sample) >= 1:
            return ttest_ind(first_sample, second_sample, axis=axis).pvalue

        # At least one sample is empty: there is nothing to test.
        return np.nan
        
#####################
# permutation tests #
#####################  

# Entry point: count the significant tests per shuffle, update the summary table, export it
    def permutation_tests(self, start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, bootstrap=0, test_name='', column_index_to_start=0, num_iterations=10000, var_test=False, row_test=False, count_corrections=False):
        self.counting_successful_tests(start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, bootstrap, column_index_to_start, num_iterations, var_test, row_test, count_corrections)
        self.update_table_with_counts_proportion(test_name)
        self.export_table_permutaion_tests()

# Define the rows to be tested and the transition between writers
    def counting_successful_tests(self, start_row, stop_row, start_writer_h0, end_writer_h0, type_of_test, bootstrap, column_index_to_start, num_iterations, var_test, row_test, count_corrections):
        # shaffling the data 'num_iterations' times
        tests_results = []
        for j in range (num_iterations):
            print(j)
            counter = 0 
            indexes_to_shuffle = self.bootstrap_data(start_row, stop_row, start_writer_h0, end_writer_h0) if bootstrap == 1 else self.preprocess_data(start_row, stop_row)
            print(indexes_to_shuffle)
            # compute tests for each columns pair
            for i in range(column_index_to_start,len(self.rellavent_data.columns)-2,2):
                columns = self.rellavent_data.columns[column_index_to_start:]
                col_1, col_2 = columns[i], columns[i+1]
                if type_of_test == 't_test':
                    p_value = self.t_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0)
                elif type_of_test == 'permutation':
                    t_value = self.t_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0)
                    p_value = self.hand_permutation_test(col_1, col_2, start_row, stop_row, start_writer_h0, end_writer_h0, t_value)
                else: 
                    raise('An incorrect test type was entered in the function')
                p_value = self.update_p_value(p_value)
                if p_value == '':
                    continue 
                else:
                    if p_value <= 0.05:
                        counter += 1

            # to var of rows and num of rows            
            if var_test == True:
                self.copy_var_table = self.var_table.copy()
                self.copy_var_table.iloc[np.arange(start_row-1, stop_row), :] = self.var_table.iloc[indexes_to_shuffle, :]
                if type_of_test == 't_test': 
                    p_value = self.student_t_test(self.copy_var_table, start_row, stop_row, start_writer_h0, end_writer_h0)
                elif type_of_test == 'permutation':
                    p_value = self.permutation_1_col(self.copy_var_table, start_row, stop_row, start_writer_h0, end_writer_h0)
                p_value = self.update_p_value(p_value)
                if p_value <= 0.05:
                    counter += 1             
                
            if row_test == True:
                self.copy_num_of_rows_column = self.num_of_rows_column.copy()
                self.copy_num_of_rows_column.iloc[np.arange(start_row-1, stop_row), :] = self.num_of_rows_column.iloc[indexes_to_shuffle, :]
                if type_of_test == 't_test': 
                    p_value = self.student_t_test(self.copy_num_of_rows_column, start_row, stop_row, start_writer_h0, end_writer_h0)
                elif type_of_test == 'permutation':
                    p_value = self.permutation_1_col(self.copy_num_of_rows_column, start_row, stop_row, start_writer_h0, end_writer_h0)
                p_value = self.update_p_value(p_value)
                if p_value <= 0.05:
                    counter += 1
                    
            if count_corrections == True:
                self.copy_corrections = self.corrections.copy()
                self.copy_corrections.iloc[np.arange(start_row-1, stop_row), :] = self.corrections.iloc[indexes_to_shuffle, :]
                if type_of_test == 't_test': 
                    p_value = self.student_t_test(self.copy_corrections, start_row, stop_row, start_writer_h0, end_writer_h0)
                elif type_of_test == 'permutation':
                    p_value = self.permutation_1_col(self.copy_corrections, start_row, stop_row, start_writer_h0, end_writer_h0)
                p_value = self.update_p_value(p_value)
                if p_value <= 0.05:
                    counter += 1
                                    
            tests_results.append(counter)
        self.tests_results = tests_results
                  
            
                       
    def preprocess_data(self, start_row, stop_row):
        # choozing rows for rellavant tests
        self.rellavent_data = self.word_counts_table.copy()
        indexes_to_shuffle = np.arange(start_row-1, stop_row)
        np.random.shuffle(indexes_to_shuffle)
        self.rellavent_data.iloc[np.arange(start_row-1, stop_row), :] = self.word_counts_table.iloc[indexes_to_shuffle, :]
        
        # Computing row var and row num
        self.compute_row_var()
        self.compute_column_num_of_rows()
        
        return indexes_to_shuffle

    def bootstrap_data(self, start_row, stop_row, start_writer_h0, end_writer_h0, random_seed=None):
        num_samples = stop_row - start_row + 1
        if random_seed is not None:
            random.seed(random_seed)  # Set the random seed if provided

        # Sample row indexes within the specified range with replacement
        sampled_indexes = random.choices(range(start_writer_h0, end_writer_h0), k=num_samples)

        # Create a new DataFrame from the sampled rows
        self.rellavent_data = self.word_counts_table.iloc[sampled_indexes]

        # Return the sampled row indexes (with duplicates)
        return sampled_indexes




# Driver script: build (or reload) the scroll object, run the statistical
# tests stage by stage and pickle the object after every stage so that a later
# run can resume from ./Stage<N>.

# NOTE(review): not referenced by any statement below.
NUM_PERMUTATIONS = np.int16(1e3)
# Checkpoint to resume from: 0 rebuilds everything from the raw document,
# N > 0 loads ./Stage<N> and runs only the stages numbered above N.
STAGE = 0  # the original comment also lists 7 here
file_path = r"C:\Users\yisha\OneDrive\Documents\Thesis\DSS\1QIsaa 1-without_verses.docx"
concated_data_path = r"C:\Users\yisha\OneDrive\Documents\Thesis\DSS\DSS_Data_verbs_data.xlsx"
output_file = r"./Letters_count.xlsx"


def _save_stage(scroll, stage_number):
    """Pickle *scroll* to ./Stage<stage_number>, closing the file even on error."""
    with open('./Stage' + str(stage_number), 'wb') as stage_file:
        pickle.dump(scroll, stage_file)


if STAGE==0:
    data = Python_Scroll(file_path)
    data.clean_data() # can also save the new data
    #data.load_data('C:\\Users\\yisha\\PycharmProjects\\pythonProject\\Thesis\\Cleaned_data.txt')
    # NOTE(review): ('ו') in the second entry is a plain string, not a 1-tuple - confirm word_counts accepts both.
    data.word_counts([(('כי','כיא'),1,('ו','ה')),(('לי','ליא'),1,('ו')), (('מי','מיא'),1,('מ','ב','ל','כ','ו')), (('כה','כוה'),1,('ו', 'ה')), (('הוא','הואה'),1,('ו', 'ה')), (('כל','כול'),2 ,('מ','ב','ל','כ','ו', 'ה')), (('ירושלם','ירושלים'),2)])
    data.join_dataframes(concated_data_path)
    data.create_tests_table(var_test=True, row_test=True, count_corrections=True)
    data.compute_row_var()
    data.compute_column_num_of_rows()
    n_success_t, n_success_p = data.TestDifference(start_1 = 1, stop_1 = 11, start_2 = 12, stop_2 = 27, test_name = "\n######\n1 vs 2\n######\n")
    print("Number of successes:\n t = ", n_success_t, "| p = ", n_success_p)
    n_success_t, n_success_p = data.TestDifference(start_1 = 28, stop_1 = 54, start_2 = 1, stop_2 = 11, test_name = "\n######\n1 vs 3\n######\n")
    print("Number of successes:\n t = ", n_success_t, "| p = ", n_success_p)
    n_success_t, n_success_p = data.TestDifference(start_1 = 28, stop_1 = 54, start_2 = 12, stop_2 = 27, test_name = "\n######\n2 vs 3\n######\n")
    print("Number of successes:\n t = ", n_success_t, "| p = ", n_success_p)
    data.orgenizing_statistical_tests(start_row=28, stop_row=54, start_writer_h0=12, end_writer_h0=27,
                                        type_of_test='t_test', test_name ='t_test_3_2',var_test=True, row_test=True, count_corrections=True, char_corrections='^')
    _save_stage(data, 1)
else:
    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only load checkpoints that were written by this script.
    with open('./Stage'+str(STAGE), 'rb') as file:
        pick_data = pickle.load(file)
    data = pick_data

if STAGE < 2:
    data.orgenizing_statistical_tests(start_row=1, stop_row=27, start_writer_h0=12, end_writer_h0=27,
                                        type_of_test='t_test', test_name ='t_test_1_2',var_test=True, row_test=True, count_corrections=True, char_corrections='^')
    _save_stage(data, 2)
if STAGE < 3:
    data.orgenizing_statistical_tests(start_row=12, stop_row=54, start_writer_h0=12, end_writer_h0=27,
                                        type_of_test='permutation', test_name ='permutation_3_2',var_test=True, row_test=True, count_corrections=True, char_corrections='^')
    _save_stage(data, 3)
if STAGE < 4:
    data.orgenizing_statistical_tests(start_row=1, stop_row=27, start_writer_h0=12, end_writer_h0=27,
                                        type_of_test='permutation', test_name ='permutation_1_2',var_test=True, row_test=True, count_corrections=True, char_corrections='^')
    _save_stage(data, 4)
if STAGE < 5:
    data.orgenizing_statistical_tests(start_row=1, stop_row=27, start_writer_h0=28, end_writer_h0=54,
                                        type_of_test='permutation', test_name ='permutation_12_3',var_test=True, row_test=True, count_corrections=True, char_corrections='^')
    _save_stage(data, 5)
if STAGE < 6:
    data.create_table_permutation_tests()
    data.permutation_tests(start_row=12, stop_row=54, start_writer_h0=12, end_writer_h0=27, type_of_test='t_test',
                            test_name='3:2_t_test', column_index_to_start=0, num_iterations=1000, var_test=True, row_test=True, count_corrections=True)
    _save_stage(data, 6)
if STAGE < 7:
    data.permutation_tests(start_row=1, stop_row=27, start_writer_h0=12, end_writer_h0=27, type_of_test='t_test',
                            test_name='1:2_t_test', column_index_to_start=0, num_iterations=1000, var_test=True, row_test=True, count_corrections=True)
    _save_stage(data, 7)
if STAGE < 8:
    data.permutation_tests(start_row=12, stop_row=54, start_writer_h0=12, end_writer_h0=27, type_of_test='permutation',
                            test_name='3:2_permutation', column_index_to_start=0, num_iterations=1000, var_test=True, row_test=True, count_corrections=True)
    _save_stage(data, 8)
if STAGE < 9:
    data.permutation_tests(start_row=1, stop_row=27, start_writer_h0=12, end_writer_h0=27, type_of_test='permutation',
                            test_name='1:2__permutation', column_index_to_start=0, num_iterations=1000, var_test=True, row_test=True, count_corrections=True)
    _save_stage(data, 9)

# NOTE(review): this run is unconditional and repeats the stage-9 call with
# identical arguments - confirm that the repetition is intended.
data.permutation_tests(start_row=1, stop_row=27, start_writer_h0=12, end_writer_h0=27, type_of_test='permutation',
                        test_name='1:2__permutation', column_index_to_start=0, num_iterations=1000, var_test=True, row_test=True, count_corrections=True)
_save_stage(data, 10)


pd.set_option('display.max_rows', None)
pd.options.display.max_colwidth = None

# iloc with the tuple (1, 0) selects a single cell (row position 1, column
# position 0), not a whole row; the variable name is kept from the original.
first_row = data.data.iloc[(1,0)]
print(first_row)

