In [1]:
import pandas as pd
import re
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


tokenizerSumLegal = AutoTokenizer.from_pretrained("nsi319/legal-led-base-16384")  
modelSumLegal = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-led-base-16384")
summarizerFacebook = pipeline("summarization", model="facebook/bart-large-cnn")

################### LAWMARKCASES ##################################
FILE_LAWMARK = '/home/jaco/Projetos/landMarkClassification/data/scraper/scrapee.csv'

dfLawMark = pd.read_csv(FILE_LAWMARK)

dfLawMark['text'] = dfLawMark['text'].astype(str)
dfLawMark['subject'] = dfLawMark['subject'].astype(str)

#dfLawMark['text'] = dfLawMark['text'].str.replace(r'\n',' ',regex=True)
#dfLawMark['text'] = dfLawMark['text'].str.replace(r'\\',' ',regex=True)
#dfLawMark['text'] = dfLawMark['text'].str.replace(r'\'',' ',regex=True)

def cleanHeaderFooter(text):
    return text[169:-124] ### no olho mesmo

dfLawMark['text'] = dfLawMark['text'].apply(cleanHeaderFooter)
dfLawMark = dfLawMark[['text','subject']]

dfLawMark = dfLawMark[(dfLawMark['text'] != '')] ### checar depois pq aparece isso 
dfLawMark = dfLawMark[dfLawMark["text"].apply(lambda x: len(x) > 1000)] ### checar depois pq aparece isso

print(f'tamanho do dataset: {len(dfLawMark)}')

### checar se o webscrape ta perfeito vs o site.
### pegar um sumarizador profissional law:

#dfLawMark = dfLawMark.groupby(['subject'])['text'].apply(','.join).reset_index()

  from .autonotebook import tqdm as notebook_tqdm


tamanho do dataset: 2271


In [2]:
#dfLawMark = dfLawMark.head(5)

In [3]:
def summary_legal_text(row):

    MAX_SUM_LEGAL_SIZE = 8000

    padding = "max_length"
    text = row['text'][0:MAX_SUM_LEGAL_SIZE]
    num_words = len(text.split())

    input_tokenized = tokenizerSumLegal.encode(text, return_tensors='pt',padding=padding,pad_to_max_length=True, max_length=6144,truncation=True)
    summary_ids = modelSumLegal.generate(input_tokenized,
                                    num_beams=4,
                                    no_repeat_ngram_size=3,
                                    length_penalty=2,
                                    min_length=int(num_words/2),
                                    max_length=num_words)
    summary = [tokenizerSumLegal.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]

    return summary

def summary_text(row):

    MAX_FACEBOOK_SIZE = 4000
    
    text = row['text'][0:MAX_FACEBOOK_SIZE] #z?
    num_words = int(len(text.split())*1.3)

    texto_sum = summarizerFacebook(text, max_length=num_words, min_length=int(num_words/2), do_sample=False)

    return texto_sum[0]['summary_text']

def get_head(row):

    end_text = 'delivered the opinion of the Court.'
    start_text = 'Syllabus'

    start_pos = row['text'].find(start_text) 
    end_pos = row['text'].find(end_text) 

    if start_pos < 0:
        start_pos = 0
    if end_pos < 0:
        end_pos = start_pos+5000

    text = row['text'][start_pos+len(start_text):end_pos+len(end_text)]
    text = re.sub(' +', ' ', text) #remove any double space, 'text trim'

    if len(text) < 1000:
        text = row['text']

    return text

dfLawMark['Syllabus'] = dfLawMark.apply(lambda row : get_head(row), axis=1)
#dfLawMark['SummaryText'] = dfLawMark.apply(lambda row : summary_text(row), axis=1)
#dfLawMark['SummaryLegalText'] = dfLawMark.apply(lambda row : summary_legal_text(row), axis=1)

In [4]:
## tirando classes sem samples:
min_sample_size = 14
df2 = dfLawMark.groupby(['subject'],as_index=False).count()
df2 = df2[df2['text'] >= min_sample_size]
lista = list(df2['subject'])

#dfLawMark = dfLawMark[dfLawMark['subject'].isin(lista)]

##testando classificações mais facieis:
dfLawMark = dfLawMark[dfLawMark['subject'].isin(['First Amendment','Freedom of Speech','Criminal Law','Racial Discrimination'])]

encodedClass = pd.factorize(dfLawMark['subject'])
dfLawMark['encoded'] = encodedClass[0]+1 ## pytorch precisa começar do 1

dfEnCode = dfLawMark[['subject','encoded']]
dfEnCode.drop_duplicates(inplace=True)

dfEnCode.to_csv('/home/jaco/Projetos/landMarkClassification/data/enCode.csv',index=False)

dfLawMark = dfLawMark.sample(frac=1)

dfLawMark.to_csv('/home/jaco/Projetos/landMarkClassification/data/onlyLandMarkWSyllabus.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfEnCode.drop_duplicates(inplace=True)


In [5]:
from sklearn.model_selection import train_test_split

X = dfLawMark['Syllabus']
y = dfLawMark['encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)#, stratify=y)

dic_train = {'Syllabus':X_train,'encoded':y_train}
dic_test = {'Syllabus':X_test,'encoded':y_test}

dfLawMark_Train = pd.DataFrame(dic_train)
dfLawMark_Test = pd.DataFrame(dic_test)


dfLawMark_Train.to_csv('/home/jaco/Projetos/landMarkClassification/data/onlyLandMarkWSyllabus_Train.csv',index=False)
dfLawMark_Test.to_csv('/home/jaco/Projetos/landMarkClassification/data/onlyLandMarkWSyllabus_Test.csv',index=False)

In [9]:
ag_test = pd.read_csv('/home/jaco/Projetos/landMarkClassification/data/agnews/test.csv')
ag_train = pd.read_csv('/home/jaco/Projetos/landMarkClassification/data/agnews/train.csv')

ag_test = ag_test.sample(n=300)
ag_train = ag_test.sample(n=100)

ag_test = ag_test.to_csv('/home/jaco/Projetos/landMarkClassification/data/agnews/test_red.csv',index=False)
ag_train = ag_train.to_csv('/home/jaco/Projetos/landMarkClassification/data/agnews/train_red.csv',index=False)

In [6]:
dfLawMark_Train.head(5)

Unnamed: 0,Syllabus,encoded
799,333 U.S. 203 68 S.Ct. 461 92 L.Ed. 649 PEOPLE ...,2
1160,"The University of Missouri at Kansas City, a ...",3
1828,334 U.S. 1 68 S.Ct. 836 92 L.Ed. 1161 SHELLEY ...,4
1080,249 U.S. 47 39 S.Ct. 247 63 L.Ed. 470 SCHENCKv...,3
329,Respondent was arrested in the front yard of ...,1
