In [1]:
import pandas as pd

# Read the file with error handling to identify the issue
try:
    df = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv")
except pd.errors.ParserError as e:
    print(f"Error encountered: {e}")
    # Read the file with error_bad_lines=False to skip problematic lines
    df = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", error_bad_lines=False)

# Display the first few rows to inspect the data
df.head()

Error encountered: Error tokenizing data. C error: Expected 1 fields in line 50, saw 2



TypeError: read_csv() got an unexpected keyword argument 'error_bad_lines'

In [2]:
import pandas as pd

# Read the file with error handling to identify the issue
try:
    df = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"Error encountered: {e}")
    # Read the file with on_bad_lines='skip' to skip problematic lines
    df = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", on_bad_lines='skip')

# Display the first few rows to inspect the data
print(df.head())

# Further analysis to understand the structure of the file
print("Number of columns:", len(df.columns))
print("Columns:", df.columns.tolist())

# Check for any missing or inconsistent data
print("Missing values per column:")
print(df.isnull().sum())

# Check the data types of each column
print("Data types per column:")
print(df.dtypes)

  entrevistas_id;descricao_campo;formatacao_campo;opcoes_lista_selecao;tipo_campo
0  1;numero_vara;número;-;Entrada por formulário web                             
1   2;cidade_vara;texto;-;Entrada por formulário web                             
2       3;uf_vara;texto;-;Entrada por formulário web                             
3  4;n_regiao_trt;número;-;Entrada por formulário...                             
4  5;informacoes_cliente_procuracao;texto;-;Entra...                             
Number of columns: 1
Columns: ['entrevistas_id;descricao_campo;formatacao_campo;opcoes_lista_selecao;tipo_campo']
Missing values per column:
entrevistas_id;descricao_campo;formatacao_campo;opcoes_lista_selecao;tipo_campo    0
dtype: int64
Data types per column:
entrevistas_id;descricao_campo;formatacao_campo;opcoes_lista_selecao;tipo_campo    object
dtype: object


In [3]:
# Correctly read the CSV file with semicolon delimiter
df = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", delimiter=';')

# Verify the data was read correctly
print("Data successfully loaded with correct delimiter")
print("Number of rows:", len(df))
print("Columns:", df.columns.tolist())
print("\nFirst 5 rows:")
print(df.head())

# Check data quality
print("\nData quality check:")
print("Missing values per column:")
print(df.isnull().sum())
print("\nData types:")
print(df.dtypes)

# Make a copy for further processing
df_processed = df.copy()

Data successfully loaded with correct delimiter
Number of rows: 507
Columns: ['entrevistas_id', 'descricao_campo', 'formatacao_campo', 'opcoes_lista_selecao', 'tipo_campo']

First 5 rows:
   entrevistas_id                 descricao_campo formatacao_campo  \
0               1                     numero_vara           número   
1               2                     cidade_vara            texto   
2               3                         uf_vara            texto   
3               4                    n_regiao_trt           número   
4               5  informacoes_cliente_procuracao            texto   

  opcoes_lista_selecao                  tipo_campo  
0                    -  Entrada por formulário web  
1                    -  Entrada por formulário web  
2                    -  Entrada por formulário web  
3                    -  Entrada por formulário web  
4                    -  Entrada por formulário web  

Data quality check:
Missing values per column:
entrevistas_id          0

In [4]:
import pandas as pd
import json

# Load and analyze d_entrevistas.csv
df_entrevistas = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", delimiter=';')
entrevistas_summary = {
    "num_rows": len(df_entrevistas),
    "columns": df_entrevistas.columns.tolist(),
    "missing_values": df_entrevistas.isnull().sum().to_dict(),
    "data_types": df_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_template.csv
df_template = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_template.csv")
template_summary = {
    "num_rows": len(df_template),
    "columns": df_template.columns.tolist(),
    "missing_values": df_template.isnull().sum().to_dict(),
    "data_types": df_template.dtypes.astype(str).to_dict()
}

# Load and analyze f_entrevistas.xlsx
df_f_entrevistas = pd.read_excel("/data/chats/0sf34/workspace/uploads/f_entrevistas.xlsx")
f_entrevistas_summary = {
    "num_rows": len(df_f_entrevistas),
    "columns": df_f_entrevistas.columns.tolist(),
    "missing_values": df_f_entrevistas.isnull().sum().to_dict(),
    "data_types": df_f_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_condicionais_regras_simples.csv
df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv")
condicionais_simples_summary = {
    "num_rows": len(df_condicionais_simples),
    "columns": df_condicionais_simples.columns.tolist(),
    "missing_values": df_condicionais_simples.isnull().sum().to_dict(),
    "data_types": df_condicionais_simples.dtypes.astype(str).to_dict()
}

# Load and analyze d_tipo_condicional.csv
df_tipo_condicional = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_tipo_condicional.csv")
tipo_condicional_summary = {
    "num_rows": len(df_tipo_condicional),
    "columns": df_tipo_condicional.columns.tolist(),
    "missing_values": df_tipo_condicional.isnull().sum().to_dict(),
    "data_types": df_tipo_condicional.dtypes.astype(str).to_dict()
}

# Combine all summaries into a single report
analysis_report = {
    "d_entrevistas": entrevistas_summary,
    "d_template": template_summary,
    "f_entrevistas": f_entrevistas_summary,
    "d_condicionais_regras_simples": condicionais_simples_summary,
    "d_tipo_condicional": tipo_condicional_summary
}

# Save the analysis report to a JSON file
with open("/data/chats/0sf34/workspace/analysis_report.json", "w") as f:
    json.dump(analysis_report, f, indent=4)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 5, saw 2


In [5]:
import pandas as pd
import json
from docx import Document

# Load and analyze d_entrevistas.csv
df_entrevistas = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", delimiter=';')
entrevistas_summary = {
    "num_rows": len(df_entrevistas),
    "columns": df_entrevistas.columns.tolist(),
    "missing_values": df_entrevistas.isnull().sum().to_dict(),
    "data_types": df_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_template.csv
df_template = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_template.csv")
template_summary = {
    "num_rows": len(df_template),
    "columns": df_template.columns.tolist(),
    "missing_values": df_template.isnull().sum().to_dict(),
    "data_types": df_template.dtypes.astype(str).to_dict()
}

# Load and analyze f_entrevistas.xlsx
df_f_entrevistas = pd.read_excel("/data/chats/0sf34/workspace/uploads/f_entrevistas.xlsx")
f_entrevistas_summary = {
    "num_rows": len(df_f_entrevistas),
    "columns": df_f_entrevistas.columns.tolist(),
    "missing_values": df_f_entrevistas.isnull().sum().to_dict(),
    "data_types": df_f_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_condicionais_regras_simples.csv with error handling
try:
    df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv", on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"Error encountered: {e}")
    df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv", delimiter=';', on_bad_lines='skip')

condicionais_simples_summary = {
    "num_rows": len(df_condicionais_simples),
    "columns": df_condicionais_simples.columns.tolist(),
    "missing_values": df_condicionais_simples.isnull().sum().to_dict(),
    "data_types": df_condicionais_simples.dtypes.astype(str).to_dict()
}

# Load and analyze d_tipo_condicional.csv
df_tipo_condicional = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_tipo_condicional.csv")
tipo_condicional_summary = {
    "num_rows": len(df_tipo_condicional),
    "columns": df_tipo_condicional.columns.tolist(),
    "missing_values": df_tipo_condicional.isnull().sum().to_dict(),
    "data_types": df_tipo_condicional.dtypes.astype(str).to_dict()
}

# Load and analyze Documentação atual do projeto 24.04.2025.docx
doc_project = Document("/data/chats/0sf34/workspace/uploads/Documentação atual do projeto 24.04.2025.docx")
project_doc_summary = {
    "num_paragraphs": len(doc_project.paragraphs),
    "text": [para.text for para in doc_project.paragraphs]
}

# Load and analyze template.docx
doc_template = Document("/data/chats/0sf34/workspace/uploads/template.docx")
template_doc_summary = {
    "num_paragraphs": len(doc_template.paragraphs),
    "text": [para.text for para in doc_template.paragraphs]
}

# Combine all summaries into a single report
analysis_report = {
    "d_entrevistas": entrevistas_summary,
    "d_template": template_summary,
    "f_entrevistas": f_entrevistas_summary,
    "d_condicionais_regras_simples": condicionais_simples_summary,
    "d_tipo_condicional": tipo_condicional_summary,
    "Documentação atual do projeto 24.04.2025": project_doc_summary,
    "template": template_doc_summary
}

# Save the analysis report to a JSON file
with open("/data/chats/0sf34/workspace/analysis_report.json", "w") as f:
    json.dump(analysis_report, f, indent=4)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 3, saw 5


In [6]:
import pandas as pd
import json
from docx import Document

# Load and analyze d_entrevistas.csv
df_entrevistas = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", delimiter=';')
entrevistas_summary = {
    "num_rows": len(df_entrevistas),
    "columns": df_entrevistas.columns.tolist(),
    "missing_values": df_entrevistas.isnull().sum().to_dict(),
    "data_types": df_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_template.csv
df_template = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_template.csv")
template_summary = {
    "num_rows": len(df_template),
    "columns": df_template.columns.tolist(),
    "missing_values": df_template.isnull().sum().to_dict(),
    "data_types": df_template.dtypes.astype(str).to_dict()
}

# Load and analyze f_entrevistas.xlsx
df_f_entrevistas = pd.read_excel("/data/chats/0sf34/workspace/uploads/f_entrevistas.xlsx")
f_entrevistas_summary = {
    "num_rows": len(df_f_entrevistas),
    "columns": df_f_entrevistas.columns.tolist(),
    "missing_values": df_f_entrevistas.isnull().sum().to_dict(),
    "data_types": df_f_entrevistas.dtypes.astize(str).to_dict()
}

# Load and analyze d_condicionais_regras_simples.csv with error handling
try:
    df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv", delimiter=';', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"Error encountered: {e}")
    df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv", on_bad_lines='skip')

condicionais_simples_summary = {
    "num_rows": len(df_condicionais_simples),
    "columns": df_condicionais_simples.columns.tolist(),
    "missing_values": df_condicionais_simples.isnull().sum().to_dict(),
    "data_types": df_condicionais_simples.dtypes.astype(str).to_dict()
}

# Load and analyze d_tipo_condicional.csv with error handling
try:
    df_tipo_condicional = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_tipo_condicional.csv", delimiter=';', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"Error encountered: {e}")
    df_tipo_condicional = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_tipo_condicional.csv", on_bad_lines='skip')

tipo_condicional_summary = {
    "num_rows": len(df_tipo_condicional),
    "columns": df_tipo_condicional.columns.tolist(),
    "missing_values": df_tipo_condicional.isnull().sum().to_dict(),
    "data_types": df_tipo_condicional.dtypes.astype(str).to_dict()
}

# Load and analyze Documentação atual do projeto 24.04.2025.docx
doc_project = Document("/data/chats/0sf34/workspace/uploads/Documentação atual do projeto 24.04.2025.docx")
project_doc_summary = {
    "num_paragraphs": len(doc_project.paragraphs),
    "headings": [para.text for para in doc_project.paragraphs if para.style.name.startswith('Heading')],
    "tables": [len(table.rows) for table in doc_project.tables],
    "text": [para.text for para in doc_project.paragraphs]
}

# Load and analyze template.docx
doc_template = Document("/data/chats/0sf34/workspace/uploads/template.docx")
template_doc_summary = {
    "num_paragraphs": len(doc_template.paragraphs),
    "headings": [para.text for para in doc_template.paragraphs if para.style.name.startswith('Heading')],
    "tables": [len(table.rows) for table in doc_template.tables],
    "text": [para.text for para in doc_template.paragraphs]
}

# Combine all summaries into a single report
analysis_report = {
    "d_entrevistas": entrevistas_summary,
    "d_template": template_summary,
    "f_entrevistas": f_entrevistas_summary,
    "d_condicionais_regras_simples": condicionais_simples_summary,
    "d_tipo_condicional": tipo_condicional_summary,
    "Documentação atual do projeto 24.04.2025": project_doc_summary,
    "template": template_doc_summary
}

# Save the analysis report to a JSON file
with open("/data/chats/0sf34/workspace/analysis_report.json", "w") as f:
    json.dump(analysis_report, f, indent=4)

AttributeError: 'Series' object has no attribute 'astize'

In [7]:
import pandas as pd
import numpy as np
from docx import Document
import json

# Correct the typo and rerun the analysis for f_entrevistas.xlsx
df_f_entrevistas = pd.read_excel("/data/chats/0sf34/workspace/uploads/f_entrevistas.xlsx")
f_entrevistas_summary = {
    "num_rows": len(df_f_entrevistas),
    "columns": df_f_entrevistas.columns.tolist(),
    "missing_values": df_f_entrevistas.isnull().sum().to_dict(),
    "data_types": df_f_entrevistas.dtypes.astype(str).to_dict()
}

# Update the analysis report with the corrected data
analysis_report = {
    "d_entrevistas": entrevistas_summary,
    "d_template": template_summary,
    "f_entrevistas": f_entrevistas_summary,
    "d_condicionais_regras_simples": condicionais_simples_summary,
    "d_tipo_condicional": tipo_condicional_summary,
    "Documentação atual do projeto 24.04.2025": project_doc_summary,
    "template": template_doc_summary
}

# Save the updated analysis report to a JSON file
with open("/data/chats/0sf34/workspace/analysis_report.json", "w") as f:
    json.dump(analysis_report, f, indent=4)

NameError: name 'tipo_condicional_summary' is not defined

In [8]:
import pandas as pd
import numpy as np
from docx import Document
import json

# Load and analyze d_entrevistas.csv
df_entrevistas = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_entrevistas.csv", delimiter=';')
entrevistas_summary = {
    "num_rows": len(df_entrevistas),
    "columns": df_entrevistas.columns.tolist(),
    "missing_values": df_entrevistas.isnull().sum().to_dict(),
    "data_types": df_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_template.csv
df_template = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_template.csv")
template_summary = {
    "num_rows": len(df_template),
    "columns": df_template.columns.tolist(),
    "missing_values": df_template.isnull().sum().to_dict(),
    "data_types": df_template.dtypes.astype(str).to_dict()
}

# Load and analyze f_entrevistas.xlsx
df_f_entrevistas = pd.read_excel("/data/chats/0sf34/workspace/uploads/f_entrevistas.xlsx")
f_entrevistas_summary = {
    "num_rows": len(df_f_entrevistas),
    "columns": df_f_entrevistas.columns.tolist(),
    "missing_values": df_f_entrevistas.isnull().sum().to_dict(),
    "data_types": df_f_entrevistas.dtypes.astype(str).to_dict()
}

# Load and analyze d_condicionais_regras_simples.csv
df_condicionais_simples = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_condicionais_regras_simples.csv", delimiter=';', on_bad_lines='skip')
condicionais_simples_summary = {
    "num_rows": len(df_condicionais_simples),
    "columns": df_condicionais_simples.columns.tolist(),
    "missing_values": df_condicionais_simples.isnull().sum().to_dict(),
    "data_types": df_condicionais_simples.dtypes.astype(str).to_dict()
}

# Load and analyze d_tipo_condicional.csv
df_tipo_condicional = pd.read_csv("/data/chats/0sf34/workspace/uploads/d_tipo_condicional.csv", delimiter=';', on_bad_lines='skip')
tipo_condicional_summary = {
    "num_rows": len(df_tipo_condicional),
    "columns": df_tipo_condicional.columns.tolist(),
    "missing_values": df_tipo_condicional.isnull().sum().to_dict(),
    "data_types": df_tipo_condicional.dtypes.astype(str).to_dict()
}

# Load and analyze Documentação atual do projeto 24.04.2025.docx
doc_project = Document("/data/chats/0sf34/workspace/uploads/Documentação atual do projeto 24.04.2025.docx")
project_doc_summary = {
    "num_paragraphs": len(doc_project.paragraphs),
    "headings": [para.text for para in doc_project.paragraphs if para.style.name.startswith('Heading')],
    "tables": [len(table.rows) for table in doc_project.tables],
    "text": [para.text for para in doc_project.paragraphs]
}

# Load and analyze template.docx
doc_template = Document("/data/chats/0sf34/workspace/uploads/template.docx")
template_doc_summary = {
    "num_paragraphs": len(doc_template.paragraphs),
    "headings": [para.text for para in doc_template.paragraphs if para.style.name.startswith('Heading')],
    "tables": [len(table.rows) for table in doc_template.tables],
    "text": [para.text for para in doc_template.paragraphs]
}

# Combine all summaries into a single report
analysis_report = {
    "d_entrevistas": entrevistas_summary,
    "d_template": template_summary,
    "f_entrevistas": f_entrevistas_summary,
    "d_condicionais_regras_simples": condicionais_simples_summary,
    "d_tipo_condicional": tipo_condicional_summary,
    "Documentação atual do projeto 24.04.2025": project_doc_summary,
    "template": template_doc_summary
}

# Save the analysis report to a JSON file
with open("/data/chats/0sf34/workspace/analysis_report.json", "w") as f:
    json.dump(analysis_report, f, indent=4)