From 2699884112bab74bcd46bde270314d80cc8067d7 Mon Sep 17 00:00:00 2001 From: Anna Roy Date: Sat, 4 Nov 2023 23:21:04 +1100 Subject: [PATCH 1/7] tab_df code --- tab_df/__int__.py | 0 tab_df/__pycache__/logics.cpython-39.pyc | Bin 0 -> 3569 bytes tab_df/display.py | 33 +++++++ tab_df/logics.py | 111 +++++++++++++++++++++++ 4 files changed, 144 insertions(+) create mode 100644 tab_df/__int__.py create mode 100644 tab_df/__pycache__/logics.cpython-39.pyc create mode 100644 tab_df/display.py create mode 100644 tab_df/logics.py diff --git a/tab_df/__int__.py b/tab_df/__int__.py new file mode 100644 index 0000000..e69de29 diff --git a/tab_df/__pycache__/logics.cpython-39.pyc b/tab_df/__pycache__/logics.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db968a84d9b718a0d58e4f6579e68f8cdf9d4463 GIT binary patch literal 3569 zcmb_e&2JM&6yMn|uRrn$4dJ8I4X9<+LRC&V36_ypUgnMP&bMZr^S`-Pvy5tZzSf+{5T|h#1UBv^<0Gd+I-2&Xsv%B;KM+kABo2Yjq0;hsLq- z#$d!*&dr=#Ik$7}U%Rl*~k zssU;>;&DrQw35^+%c@6OJz;zSF)M9qvV=n|3g|=2I1)q*=1u)4T2XjV^psY}P|DXX zVdv23;LQdGU(eXFi6iSx?tR+^UqJ!i=41wG_Cb0)3L$8T7s}Y2!gB^8U&U{{*-YGa^BT-Tq!5JOS)Zz0^l9gDX5rVXrMpx)S+Q(qS)zNX2>31JVH_(0| z)-t?z$Lv@~bbp(yt?%#f%P1vniW;qoQf_nZ;KV#^9J88n4O^}Sj}6a^V38K`^`~&n zScd#ntRpJcF`H4YoU6nSaN;Jmi4v#Lj$`W-+ObR3ae0pR z66x_6Bc5UDY-Q z*2YKbHS1WvvCKYX$`c)x(X9q1>OuE;#7~$k#c3-UK;*m1H8to3Ou|tM^HO%uj|8vd z?tFs?g_Uorbh2B^bB;>jv!Qzq1@B-Jp!_&T5A_WN)z6n4|-%pmTd4 z^WljGCRsWINP2CIbrSi3Q0at-sbGw0H2vP-WMQ0e3h=o|X!gmwkc@2OaOH8-?eT;9QxUeK z`nB)x`F>e@1HE`st$WCE3;i(E@ME4(bWlq7MVJh*x~g^6M!|qKnwwnC`kGP@r5DQmDp6BvYGdvECn<(5Ok6$iba`w}+XeFv6H^T-iji{KCOEH6 zEFj&6?+zCmD&f?7d-W$+DGpaAW34VRJU{ugHA=^zr8g#1w7UK>&i3n%bxm3SN$H`u z9Hprks*iSFn=IeKupNEJYO}f(i58Fh8J1bnQ9JQ(@v{6j5}&EG;d_ObP(3gzjZ-S* zluS4ojDJN7^60H$uJ4BYAOJM}Z1_}_M{0ox7YQ=0VY=2b^ZqmcbxV`yy&%~@FrU#8 z(`=RM+Z4Sd2xKJ)x;+?Ri5|#W5d1O-6FtMJw&NFxP(`J^jH(JwZ#bu?k<$ajDQR=c zTKo$lUlO@Rggl6E5}}H`K&X)9qnAifZG#nL4gV$2QGeHSv0tjV6{q5Qo|5$~>Gs1E aLZMYh>kE6TBXotjsV{VgqE*#op7}p^zVa>r literal 0 HcmV?d00001 diff --git a/tab_df/display.py b/tab_df/display.py new file mode 100644 index 0000000..e76d730 --- /dev/null +++ b/tab_df/display.py @@ -0,0 +1,33 @@ + +import streamlit as st +from logics import Dataset + + +def display_tab_df_content(file_path): + # Instantiate the Dataset class and set the data + dataset = Dataset(file_path) + dataset.set_data() + + st.title("Data Analysis - DataFrame") + + # Expander for displaying dataset summary + with st.expander("Dataset Summary"): + summary_data = dataset.get_summary() + st.table(summary_data) + + # Expander for selecting rows to display + with st.expander("Select Rows to Display"): + num_rows = st.slider("Select the number of rows to display", 5, 50, 5) + display_method = st.radio("Select the display method", ("head", "tail", "sample")) + + if display_method == "head": + st.dataframe(dataset.get_head(num_rows)) + elif display_method == "tail": + st.dataframe(dataset.get_tail(num_rows)) + else: + st.dataframe(dataset.get_sample(num_rows)) + + # Display column information + st.header("Column Information") + st.table(dataset.table) + diff --git a/tab_df/logics.py b/tab_df/logics.py new file mode 100644 index 0000000..6ec70c8 --- /dev/null +++ b/tab_df/logics.py @@ -0,0 +1,111 @@ +import pandas as pd + +class Dataset: + def __init__(self, file_path): + self.file_path = file_path + self.df = None + self.cols_list = [] + self.n_rows = 0 + self.n_cols = 0 + self.n_duplicates = 0 + self.n_missing = 0 + self.n_num_cols = 0 + self.n_text_cols = 0 + self.table = None + + def set_data(self): + self.set_df() + self.set_columns() + self.set_dimensions() + self.set_duplicates() + self.set_missing() + self.set_numeric() + self.set_text() + self.set_table() + +# Class method that will load the uploaded CSV file as Pandas DataFrame and store it as attribute (self.df) if it hasn't been provided before. + + def set_df(self): + if self.df is None: + self.df = pd.read_csv(self.file_path) + + +# Class method that checks if self.df is empty or none + + def is_df_none(self): + return self.df is None + +# Class method that extract the list of columns names and store the results in the relevant attribute (self.cols_list) if self.df is not empty nor None + + def set_columns(self): + if not self.is_df_none(): + self.cols_list = list(self.df.columns) + +# Class method that computes the dimensions (number of columns and rows) of self.df and store the results in the relevant attributes (self.n_rows, self.n_cols) if self.df is not empty nor None + def set_dimensions(self): + if not self.is_df_none(): + self.n_rows, self.n_cols = self.df.shape +# Class method that computes the number of duplicated of self.df and store the results in the relevant attribute (self.n_duplicates) if self.df is not empty nor None + + def set_duplicates(self): + if not self.is_df_none(): + self.n_duplicates = len(self.df) - len(self.df.drop_duplicates()) + +# Class method that computes the number of missing values of self.df and store the results in the relevant attribute (self.n_missing) if self.df is not empty nor None + + def set_missing(self): + if not self.is_df_none(): + self.n_missing = self.df.isnull().sum().sum() + +# Class method that computes the number of columns that are numeric type and store the results in the relevant attribute (self.n_num_cols) if self.df is not empty nor None + + def set_numeric(self): + if not self.is_df_none(): + numeric_cols = self.df.select_dtypes(include=['number']).columns + self.n_num_cols = len(numeric_cols) + +# Class method that computes the number of columns that are text type and store the results in the relevant attribute (self.n_text_cols) if self.df is not empty nor None + + def set_text(self): + if not self.is_df_none(): + text_cols = self.df.select_dtypes(include=['object']).columns + self.n_text_cols = len(text_cols) + +# Class method that computes the first rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + + def get_head(self, n=5): + if not self.is_df_none(): + return self.df.head(n) + +# Class method that computes the last rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + + def get_tail(self, n=5): + if not self.is_df_none(): + return self.df.tail(n) +# Class method that computes a random sample of rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + + def get_sample(self, n=5): + if not self.is_df_none(): + return self.df.sample(n) + +# Class method that computes the Dataframe containing the list of columns with their data types and memory usage and store the results in the relevant attribute (self.table) if self.df is not empty nor None + + def set_table(self): + if not self.is_df_none(): + self.table = pd.DataFrame({'Column Name': self.cols_list}) + self.table['Data Type'] = self.df.dtypes.values + self.table['Memory Usage (KB)'] = self.df.memory_usage(deep=True) / 1024 + +# Class method that formats all requested information from self.df to be displayed in the Dataframe tab of Streamlit app as a Pandas dataframe with 2 columns: Description and Value + + def get_summary(self): + summary_data = { + 'Description': ['Number of Rows', 'Number of Columns', 'Number of Duplicated Rows', + 'Number of Rows with Missing Values', 'Number of Numeric Columns', + 'Number of Text Columns'], + 'Value': [self.n_rows, self.n_cols, self.n_duplicates, self.n_missing, + self.n_num_cols, self.n_text_cols] + } + return pd.DataFrame(summary_data, columns=['Description', 'Value']) + + From 92e7bc6b35858abd97f9eafbf971ef26fb6c9d3e Mon Sep 17 00:00:00 2001 From: Anna Roy Date: Wed, 8 Nov 2023 23:13:08 +1100 Subject: [PATCH 2/7] Fixed a logic in display.py --- tab_df/display.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tab_df/display.py b/tab_df/display.py index e76d730..1c12400 100644 --- a/tab_df/display.py +++ b/tab_df/display.py @@ -1,14 +1,20 @@ - import streamlit as st -from logics import Dataset +from logics import Dataset def display_tab_df_content(file_path): # Instantiate the Dataset class and set the data dataset = Dataset(file_path) - dataset.set_data() + dataset.set_df() + dataset.set_data() + dataset.set_dimensions() + dataset.set_numeric() + dataset.set_columns() - st.title("Data Analysis - DataFrame") + # Store the dataset in st.session_state + st.session_state['dataset'] = dataset + + st.title("DataFrame") # Expander for displaying dataset summary with st.expander("Dataset Summary"): @@ -30,4 +36,3 @@ def display_tab_df_content(file_path): # Display column information st.header("Column Information") st.table(dataset.table) - From 64b97209b2c345dbfdb0705a75ca13ce98f460f9 Mon Sep 17 00:00:00 2001 From: Anna Roy Date: Fri, 10 Nov 2023 00:16:53 +1100 Subject: [PATCH 3/7] Fixed code --- README.md | 1 - tab_df/__int__.py => __int__.py | 0 tab_df/display.py => display.py | 16 +++++++++------- tab_df/logics.py => logics.py | 18 +++++++++++++----- tab_df/__pycache__/logics.cpython-39.pyc | Bin 3569 -> 0 bytes 5 files changed, 22 insertions(+), 13 deletions(-) delete mode 100644 README.md rename tab_df/__int__.py => __int__.py (100%) rename tab_df/display.py => display.py (67%) rename tab_df/logics.py => logics.py (89%) delete mode 100644 tab_df/__pycache__/logics.cpython-39.pyc diff --git a/README.md b/README.md deleted file mode 100644 index 7d48bf1..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# DSP_Assignment_3-Group_6- \ No newline at end of file diff --git a/tab_df/__int__.py b/__int__.py similarity index 100% rename from tab_df/__int__.py rename to __int__.py diff --git a/tab_df/display.py b/display.py similarity index 67% rename from tab_df/display.py rename to display.py index 1c12400..928dfa5 100644 --- a/tab_df/display.py +++ b/display.py @@ -17,22 +17,24 @@ def display_tab_df_content(file_path): st.title("DataFrame") # Expander for displaying dataset summary - with st.expander("Dataset Summary"): + with st.expander("Dataframe"): summary_data = dataset.get_summary() st.table(summary_data) # Expander for selecting rows to display - with st.expander("Select Rows to Display"): - num_rows = st.slider("Select the number of rows to display", 5, 50, 5) - display_method = st.radio("Select the display method", ("head", "tail", "sample")) + with st.expander("Explore Dataframe"): + num_rows = st.slider("Select the number of rows to be displayed", 5, 50, 5) + display_method = st.radio("Exploration Method", ("Head", "Tail", "Sample")) - if display_method == "head": + st.header("Top Rows of Selected Table") + + if display_method == "Head": st.dataframe(dataset.get_head(num_rows)) - elif display_method == "tail": + elif display_method == "Tail": st.dataframe(dataset.get_tail(num_rows)) else: st.dataframe(dataset.get_sample(num_rows)) # Display column information - st.header("Column Information") + st.header("Columns") st.table(dataset.table) diff --git a/tab_df/logics.py b/logics.py similarity index 89% rename from tab_df/logics.py rename to logics.py index 6ec70c8..aeeba4c 100644 --- a/tab_df/logics.py +++ b/logics.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np class Dataset: def __init__(self, file_path): @@ -89,12 +90,20 @@ def get_sample(self, n=5): return self.df.sample(n) # Class method that computes the Dataframe containing the list of columns with their data types and memory usage and store the results in the relevant attribute (self.table) if self.df is not empty nor None - + def set_table(self): if not self.is_df_none(): - self.table = pd.DataFrame({'Column Name': self.cols_list}) - self.table['Data Type'] = self.df.dtypes.values - self.table['Memory Usage (KB)'] = self.df.memory_usage(deep=True) / 1024 + self.table = pd.DataFrame({'column': self.cols_list}) + self.table['data_type'] = self.df.dtypes.values + + # Calculate memory usage for each column + memory_usage = [] + for column in self.cols_list: + mem = self.df[column].memory_usage(deep=True, index=False) / (1024 * 1024) # Calculate memory usage in MB + memory_usage.append(f"{mem:.2f} MB") # Format memory usage to display as "X.XX MB" + + self.table['memory'] = memory_usage + # Class method that formats all requested information from self.df to be displayed in the Dataframe tab of Streamlit app as a Pandas dataframe with 2 columns: Description and Value @@ -108,4 +117,3 @@ def get_summary(self): } return pd.DataFrame(summary_data, columns=['Description', 'Value']) - diff --git a/tab_df/__pycache__/logics.cpython-39.pyc b/tab_df/__pycache__/logics.cpython-39.pyc deleted file mode 100644 index db968a84d9b718a0d58e4f6579e68f8cdf9d4463..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3569 zcmb_e&2JM&6yMn|uRrn$4dJ8I4X9<+LRC&V36_ypUgnMP&bMZr^S`-Pvy5tZzSf+{5T|h#1UBv^<0Gd+I-2&Xsv%B;KM+kABo2Yjq0;hsLq- z#$d!*&dr=#Ik$7}U%Rl*~k zssU;>;&DrQw35^+%c@6OJz;zSF)M9qvV=n|3g|=2I1)q*=1u)4T2XjV^psY}P|DXX zVdv23;LQdGU(eXFi6iSx?tR+^UqJ!i=41wG_Cb0)3L$8T7s}Y2!gB^8U&U{{*-YGa^BT-Tq!5JOS)Zz0^l9gDX5rVXrMpx)S+Q(qS)zNX2>31JVH_(0| z)-t?z$Lv@~bbp(yt?%#f%P1vniW;qoQf_nZ;KV#^9J88n4O^}Sj}6a^V38K`^`~&n zScd#ntRpJcF`H4YoU6nSaN;Jmi4v#Lj$`W-+ObR3ae0pR z66x_6Bc5UDY-Q z*2YKbHS1WvvCKYX$`c)x(X9q1>OuE;#7~$k#c3-UK;*m1H8to3Ou|tM^HO%uj|8vd z?tFs?g_Uorbh2B^bB;>jv!Qzq1@B-Jp!_&T5A_WN)z6n4|-%pmTd4 z^WljGCRsWINP2CIbrSi3Q0at-sbGw0H2vP-WMQ0e3h=o|X!gmwkc@2OaOH8-?eT;9QxUeK z`nB)x`F>e@1HE`st$WCE3;i(E@ME4(bWlq7MVJh*x~g^6M!|qKnwwnC`kGP@r5DQmDp6BvYGdvECn<(5Ok6$iba`w}+XeFv6H^T-iji{KCOEH6 zEFj&6?+zCmD&f?7d-W$+DGpaAW34VRJU{ugHA=^zr8g#1w7UK>&i3n%bxm3SN$H`u z9Hprks*iSFn=IeKupNEJYO}f(i58Fh8J1bnQ9JQ(@v{6j5}&EG;d_ObP(3gzjZ-S* zluS4ojDJN7^60H$uJ4BYAOJM}Z1_}_M{0ox7YQ=0VY=2b^ZqmcbxV`yy&%~@FrU#8 z(`=RM+Z4Sd2xKJ)x;+?Ri5|#W5d1O-6FtMJw&NFxP(`J^jH(JwZ#bu?k<$ajDQR=c zTKo$lUlO@Rggl6E5}}H`K&X)9qnAifZG#nL4gV$2QGeHSv0tjV6{q5Qo|5$~>Gs1E aLZMYh>kE6TBXotjsV{VgqE*#op7}p^zVa>r From b51982c344c2ecc7e19d3e74bf2c3db8e4df5fa6 Mon Sep 17 00:00:00 2001 From: Anna Roy Date: Fri, 10 Nov 2023 00:29:29 +1100 Subject: [PATCH 4/7] file --- README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 From a269366b87b610bfe9fbbdaa9d275587eed63f03 Mon Sep 17 00:00:00 2001 From: savanth-nair Date: Thu, 9 Nov 2023 21:36:48 +0530 Subject: [PATCH 5/7] working code --- display.py | 51 ++++++--------- logics.py | 187 ++++++++++++++++++++++++++--------------------------- 2 files changed, 110 insertions(+), 128 deletions(-) diff --git a/display.py b/display.py index 928dfa5..ae5f85c 100644 --- a/display.py +++ b/display.py @@ -1,40 +1,27 @@ +# tab_text/display.py import streamlit as st +from tab_text.logics import TextColumn -from logics import Dataset +def display_tab_text_content(file_path=None,df=None): + st.title("Text Serie Analysis") -def display_tab_df_content(file_path): - # Instantiate the Dataset class and set the data - dataset = Dataset(file_path) - dataset.set_df() - dataset.set_data() - dataset.set_dimensions() - dataset.set_numeric() - dataset.set_columns() + text_column_instance = TextColumn(file_path, df) + text_column_instance.find_text_cols() - # Store the dataset in st.session_state - st.session_state['dataset'] = dataset + if not text_column_instance.cols_list: + st.warning("No text columns found in the dataset.") + return - st.title("DataFrame") + selected_column = st.selectbox("Select a text column to explore:", text_column_instance.cols_list) + print(selected_column) + text_column_instance.set_data(selected_column) - # Expander for displaying dataset summary - with st.expander("Dataframe"): - summary_data = dataset.get_summary() - st.table(summary_data) + with st.expander("Text Column Analysis"): + st.subheader("Summary:") + st.table(text_column_instance.get_summary()) - # Expander for selecting rows to display - with st.expander("Explore Dataframe"): - num_rows = st.slider("Select the number of rows to be displayed", 5, 50, 5) - display_method = st.radio("Exploration Method", ("Head", "Tail", "Sample")) + st.subheader("Histogram:") + st.altair_chart(text_column_instance.barchart) - st.header("Top Rows of Selected Table") - - if display_method == "Head": - st.dataframe(dataset.get_head(num_rows)) - elif display_method == "Tail": - st.dataframe(dataset.get_tail(num_rows)) - else: - st.dataframe(dataset.get_sample(num_rows)) - - # Display column information - st.header("Columns") - st.table(dataset.table) + st.subheader("Top 20 Most Frequent Values:") + st.dataframe(text_column_instance.frequent) diff --git a/logics.py b/logics.py index aeeba4c..0b92917 100644 --- a/logics.py +++ b/logics.py @@ -1,119 +1,114 @@ +# tab_text/logics.py + import pandas as pd -import numpy as np +import altair as alt -class Dataset: - def __init__(self, file_path): +class TextColumn: + def __init__(self, file_path=None, df=None): self.file_path = file_path - self.df = None + self.df = df self.cols_list = [] - self.n_rows = 0 - self.n_cols = 0 - self.n_duplicates = 0 - self.n_missing = 0 - self.n_num_cols = 0 - self.n_text_cols = 0 - self.table = None - - def set_data(self): - self.set_df() - self.set_columns() - self.set_dimensions() - self.set_duplicates() + self.serie = None + self.n_unique = None + self.n_missing = None + self.n_empty = None + self.n_mode = None + self.n_space = None + self.n_lower = None + self.n_upper = None + self.n_alpha = None + self.n_digit = None + self.barchart = alt.Chart() + self.frequent = pd.DataFrame(columns=['value', 'occurrence', 'percentage']) + + def find_text_cols(self): + if self.df is None and self.file_path is not None: + self.df = pd.read_csv(self.file_path) + + if self.df is not None: + self.cols_list = [col for col in self.df.columns if self.df[col].dtype == 'int64'] + + def set_data(self, col_name): + self.serie = self.df[col_name] if col_name in self.df.columns else None + if self.is_serie_none(): + return + + self.convert_serie_to_text() + self.set_unique() self.set_missing() - self.set_numeric() - self.set_text() - self.set_table() - -# Class method that will load the uploaded CSV file as Pandas DataFrame and store it as attribute (self.df) if it hasn't been provided before. - - def set_df(self): - if self.df is None: - self.df = pd.read_csv(self.file_path) - - -# Class method that checks if self.df is empty or none - - def is_df_none(self): - return self.df is None + self.set_empty() + self.set_mode() + self.set_whitespace() + self.set_lowercase() + self.set_uppercase() + self.set_alphabet() + self.set_digit() + self.set_barchart() + self.set_frequent() -# Class method that extract the list of columns names and store the results in the relevant attribute (self.cols_list) if self.df is not empty nor None + def convert_serie_to_text(self): + self.serie = self.serie.astype(str) - def set_columns(self): - if not self.is_df_none(): - self.cols_list = list(self.df.columns) + def is_serie_none(self): + return self.serie is None or self.serie.empty -# Class method that computes the dimensions (number of columns and rows) of self.df and store the results in the relevant attributes (self.n_rows, self.n_cols) if self.df is not empty nor None - def set_dimensions(self): - if not self.is_df_none(): - self.n_rows, self.n_cols = self.df.shape -# Class method that computes the number of duplicated of self.df and store the results in the relevant attribute (self.n_duplicates) if self.df is not empty nor None - - def set_duplicates(self): - if not self.is_df_none(): - self.n_duplicates = len(self.df) - len(self.df.drop_duplicates()) - -# Class method that computes the number of missing values of self.df and store the results in the relevant attribute (self.n_missing) if self.df is not empty nor None + def set_unique(self): + self.n_unique = self.serie.nunique() def set_missing(self): - if not self.is_df_none(): - self.n_missing = self.df.isnull().sum().sum() - -# Class method that computes the number of columns that are numeric type and store the results in the relevant attribute (self.n_num_cols) if self.df is not empty nor None + self.n_missing = self.serie.isnull().sum() - def set_numeric(self): - if not self.is_df_none(): - numeric_cols = self.df.select_dtypes(include=['number']).columns - self.n_num_cols = len(numeric_cols) + def set_empty(self): + self.n_empty = (self.serie == '').sum() -# Class method that computes the number of columns that are text type and store the results in the relevant attribute (self.n_text_cols) if self.df is not empty nor None + def set_mode(self): + self.n_mode = self.serie.mode().iloc[0] - def set_text(self): - if not self.is_df_none(): - text_cols = self.df.select_dtypes(include=['object']).columns - self.n_text_cols = len(text_cols) + def set_whitespace(self): + self.n_space = self.serie.apply(lambda x: x.isspace()).sum() -# Class method that computes the first rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + def set_lowercase(self): + self.n_lower = self.serie.str.islower().sum() - def get_head(self, n=5): - if not self.is_df_none(): - return self.df.head(n) + def set_uppercase(self): + self.n_upper = self.serie.str.isupper().sum() -# Class method that computes the last rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + def set_alphabet(self): + self.n_alpha = self.serie.apply(lambda x: x.isalpha()).sum() - def get_tail(self, n=5): - if not self.is_df_none(): - return self.df.tail(n) -# Class method that computes a random sample of rows of self.df according to the provided number of rows specified as parameter (default: 5) if self.df is not empty nor None + def set_digit(self): + self.n_digit = self.serie.apply(lambda x: x.isdigit()).sum() - def get_sample(self, n=5): - if not self.is_df_none(): - return self.df.sample(n) + def set_barchart(self): + print(self.serie.reset_index()) + chart = alt.Chart(self.serie.reset_index(), height=200).mark_bar().encode( + x=alt.X('SalePrice:Q', title='Values', bin=True), + #y=alt.Y('count()', title='Count',bin=True), + y='count()', + #tooltip=['SalePrice:Q', 'count()'] + ).interactive() -# Class method that computes the Dataframe containing the list of columns with their data types and memory usage and store the results in the relevant attribute (self.table) if self.df is not empty nor None - - def set_table(self): - if not self.is_df_none(): - self.table = pd.DataFrame({'column': self.cols_list}) - self.table['data_type'] = self.df.dtypes.values + self.barchart = chart - # Calculate memory usage for each column - memory_usage = [] - for column in self.cols_list: - mem = self.df[column].memory_usage(deep=True, index=False) / (1024 * 1024) # Calculate memory usage in MB - memory_usage.append(f"{mem:.2f} MB") # Format memory usage to display as "X.XX MB" + def set_frequent(self, end=20): + value_counts = self.serie.value_counts().head(end).reset_index() + value_counts.columns = ['value', 'occurrence'] + value_counts['percentage'] = (value_counts['occurrence'] / len(self.serie)) * 100 - self.table['memory'] = memory_usage - - -# Class method that formats all requested information from self.df to be displayed in the Dataframe tab of Streamlit app as a Pandas dataframe with 2 columns: Description and Value + self.frequent = value_counts def get_summary(self): - summary_data = { - 'Description': ['Number of Rows', 'Number of Columns', 'Number of Duplicated Rows', - 'Number of Rows with Missing Values', 'Number of Numeric Columns', - 'Number of Text Columns'], - 'Value': [self.n_rows, self.n_cols, self.n_duplicates, self.n_missing, - self.n_num_cols, self.n_text_cols] - } - return pd.DataFrame(summary_data, columns=['Description', 'Value']) - + summary_data = [ + ("Number of Unique Values", self.n_unique), + ("Number of Rows with Missing Values", self.n_missing), + ("Number of Empty Rows", self.n_empty), + ("Number of Rows with Only Whitespaces", self.n_space), + ("Number of Rows with Only Lowercases", self.n_lower), + ("Number of Rows with Only Uppercases", self.n_upper), + ("Number of Rows with Alphabets", self.n_alpha), + ("Number of Rows with Numbers", self.n_digit), + ("Mode Value", self.n_mode), + ] + + summary_df = pd.DataFrame(summary_data, columns=['Description', 'Value']) + return summary_df From 63b9c931dc241241c88611f4b562699c278efe8a Mon Sep 17 00:00:00 2001 From: savanth-nair Date: Fri, 10 Nov 2023 12:00:51 +0530 Subject: [PATCH 6/7] Made changes to logics file --- display.py | 26 ++++++++ git/DSP_Assignment_3-Group_6- | 1 + logics.py | 112 ++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 display.py create mode 160000 git/DSP_Assignment_3-Group_6- create mode 100644 logics.py diff --git a/display.py b/display.py new file mode 100644 index 0000000..69c32ae --- /dev/null +++ b/display.py @@ -0,0 +1,26 @@ +# tab_text/display.py +import streamlit as st +from tab_text.logics import TextColumn + +def display_tab_text_content(file_path=None,df=None): + st.title("Text Serie Analysis") + + text_column_instance = TextColumn(file_path, df) + text_column_instance.find_text_cols() + + if not text_column_instance.cols_list: + st.warning("No text columns found in the dataset.") + return + + selected_column = st.selectbox("Select a text column to explore:", text_column_instance.cols_list) + text_column_instance.set_data(selected_column) + + with st.expander("Text Column Analysis"): + st.subheader("Summary:") + st.table(text_column_instance.get_summary()) + + st.subheader("Histogram:") + st.altair_chart(text_column_instance.barchart) + + st.subheader("Top 20 Most Frequent Values:") + st.dataframe(text_column_instance.frequent) diff --git a/git/DSP_Assignment_3-Group_6- b/git/DSP_Assignment_3-Group_6- new file mode 160000 index 0000000..47ccb65 --- /dev/null +++ b/git/DSP_Assignment_3-Group_6- @@ -0,0 +1 @@ +Subproject commit 47ccb65f4de220d45ce82bfb0a66b6b5c9b9dd50 diff --git a/logics.py b/logics.py new file mode 100644 index 0000000..5242474 --- /dev/null +++ b/logics.py @@ -0,0 +1,112 @@ +# tab_text/logics.py + +import pandas as pd +import altair as alt + +class TextColumn: + def __init__(self, file_path=None, df=None): + self.file_path = file_path + self.df = df + self.cols_list = [] + self.serie = None + self.n_unique = None + self.n_missing = None + self.n_empty = None + self.n_mode = None + self.n_space = None + self.n_lower = None + self.n_upper = None + self.n_alpha = None + self.n_digit = None + self.barchart = alt.Chart() + self.frequent = pd.DataFrame(columns=['value', 'occurrence', 'percentage']) + + def find_text_cols(self): + if self.df is None and self.file_path is not None: + self.df = pd.read_csv(self.file_path) + + if self.df is not None: + self.cols_list = [col for col in self.df.columns if self.df[col].dtype == 'object'] + + def set_data(self, col_name): + self.serie = self.df[col_name] if col_name in self.df.columns else None + if self.is_serie_none(): + return + + self.convert_serie_to_text() + self.set_unique() + self.set_missing() + self.set_empty() + self.set_mode() + self.set_whitespace() + self.set_lowercase() + self.set_uppercase() + self.set_alphabet() + self.set_digit() + self.set_barchart() + self.set_frequent() + + def convert_serie_to_text(self): + self.serie = self.serie.astype(str) + + def is_serie_none(self): + return self.serie is None or self.serie.empty + + def set_unique(self): + self.n_unique = self.serie.nunique() + + def set_missing(self): + self.n_missing = self.serie.isnull().sum() + + def set_empty(self): + self.n_empty = (self.serie == '').sum() + + def set_mode(self): + self.n_mode = self.serie.mode().iloc[0] + + def set_whitespace(self): + self.n_space = self.serie.apply(lambda x: x.isspace()).sum() + + def set_lowercase(self): + self.n_lower = self.serie.str.islower().sum() + + def set_uppercase(self): + self.n_upper = self.serie.str.isupper().sum() + + def set_alphabet(self): + self.n_alpha = self.serie.apply(lambda x: x.isalpha()).sum() + + def set_digit(self): + self.n_digit = self.serie.apply(lambda x: x.isdigit()).sum() + + def set_barchart(self): + chart = alt.Chart(self.serie.reset_index(), height=300).mark_bar().encode( + x=alt.X('index:N', title='Values'), + y=alt.Y('count()', title='Count'), + tooltip=['index:N', 'count()'] + ).interactive() + + self.barchart = chart + + def set_frequent(self, end=20): + value_counts = self.serie.value_counts().head(end).reset_index() + value_counts.columns = ['value', 'occurrence'] + value_counts['percentage'] = (value_counts['occurrence'] / len(self.serie)) * 100 + + self.frequent = value_counts + + def get_summary(self): + summary_data = [ + ("Number of Unique Values", self.n_unique), + ("Number of Missing Values", self.n_missing), + ("Number of Empty Values", self.n_empty), + ("Mode Value", self.n_mode), + ("Number of Whitespace Values", self.n_space), + ("Number of Lowercase Values", self.n_lower), + ("Number of Uppercase Values", self.n_upper), + ("Number of Alphabetical Values", self.n_alpha), + ("Number of Numeric Values", self.n_digit) + ] + + summary_df = pd.DataFrame(summary_data, columns=['Description', 'Value']) + return summary_df From af92fa7f9b6286d7ad3b4ac37638a190a3051344 Mon Sep 17 00:00:00 2001 From: savanth-nair Date: Fri, 10 Nov 2023 12:41:31 +0530 Subject: [PATCH 7/7] Changes made in Logics file --- logics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/logics.py b/logics.py index 0b92917..8f98fca 100644 --- a/logics.py +++ b/logics.py @@ -43,7 +43,7 @@ def set_data(self, col_name): self.set_uppercase() self.set_alphabet() self.set_digit() - self.set_barchart() + self.set_barchart(col_name) self.set_frequent() def convert_serie_to_text(self): @@ -79,10 +79,10 @@ def set_alphabet(self): def set_digit(self): self.n_digit = self.serie.apply(lambda x: x.isdigit()).sum() - def set_barchart(self): + def set_barchart(self,col_name): print(self.serie.reset_index()) chart = alt.Chart(self.serie.reset_index(), height=200).mark_bar().encode( - x=alt.X('SalePrice:Q', title='Values', bin=True), + x=alt.X(col_name+':Q', title='Values', bin=True), #y=alt.Y('count()', title='Count',bin=True), y='count()', #tooltip=['SalePrice:Q', 'count()']