In [None]:
#importing libraries

In [None]:
#importing libraries
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [None]:
#loading the training data
data=pd.read_csv('train.csv')
#preview only the first rows instead of rendering the whole frame
data.head()

In [None]:
#shape of the training data as (rows, columns)
data.shape

In [None]:
#column labels of the training data
data.columns

In [None]:
#dtype of each column in the training data
data.dtypes

In [None]:
#number of missing values in each column (isna is the same check as isnull)
data.isna().sum()

In [None]:
#number of rows for each distinct value of the 'label' column
data['label'].value_counts()

In [None]:
#input feature (raw text column) and target variable (label column)
#NOTE: x is reassigned to the TF-IDF matrix in the vectorization cell further down
x,y=data['content'],data['label']

In [None]:
#raw 'content' column as loaded, shown before any cleaning is applied
data['content']

In [None]:
#text normalization on feature data
#missing values become empty strings and everything is cast to str first,
#so the string operations below cannot fail on NaN or non-text entries
data['clean_content']=(
    data['content']
    .fillna('')
    .astype(str)
    #lowercase
    .str.lower()
    #remove every character that is not a letter, digit or whitespace
    .str.replace(r'[^a-zA-Z0-9\s]','',regex=True)
    #remove leading and trailing whitespace
    .str.strip()
)

In [None]:
#cleaned text column produced by the normalization cell
data['clean_content']

In [None]:
#converting text into numbers with TF-IDF
#NOTE(review): the vectorizer is fitted on every row of data, before the
#train/test split below, so the held-out rows also contribute to the vocabulary
tfidf=TfidfVectorizer(
    max_features=3000,
    ngram_range=(1,2),
    stop_words='english',
    min_df=2,
)
#x now holds the TF-IDF matrix (it replaces the raw text assigned to x earlier)
x=tfidf.fit_transform(data['clean_content'])

In [None]:
#logistic-regression classifier with balanced class weights
model=LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)

In [None]:
#model fitting on the entire data set
#NOTE(review): model is re-created and fitted on x_train in later cells,
#so this fit is not the one the later predictions use
model.fit(x,y)

In [None]:
#splitting into training data and testing data (20% held out, stratified on y)
x_train,x_test,y_train,y_test=train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=40,
)

In [None]:
#baseline model: a fresh, unfitted classifier that replaces the earlier model object
model=LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)

In [None]:
#fitting the baseline model on the training split only
model.fit(x_train,y_train)

In [None]:
#predicted labels for the held-out test split
y_pred=model.predict(x_test)
y_pred

In [None]:
#checking model performance on the held-out test split:
#overall accuracy followed by the per-class classification report
print('Accuracy: ',accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
#chunk_text
def chunk_text(text,chunk_size=2000,overlap=200):
    """Split text into consecutive character windows that overlap.

    Args:
        text: string to split.
        chunk_size: maximum number of characters in each chunk.
        overlap: number of characters each chunk shares with the previous
            one; must be smaller than chunk_size.

    Returns:
        List of chunks in order of appearance. Empty text gives an empty
        list, and the last chunk may be shorter than chunk_size.

    Raises:
        ValueError: if overlap is not smaller than chunk_size.
    """
    # Each window starts `step` characters after the previous one. A step of
    # zero or less would never reach the end of the text, so reject it
    # instead of looping forever.
    step=chunk_size-overlap
    if step<=0:
        raise ValueError('overlap must be smaller than chunk_size')
    return [text[start:start+chunk_size] for start in range(0,len(text),step)]
#split every cleaned text into overlapping chunks (one list of chunks per row)
data['chunks'] = data['clean_content'].apply(chunk_text)
#number of chunks produced for the row with index label 0
print(len(data['chunks'][0]))

In [None]:
#flatten chunks into rows: one row per chunk, carrying the label of the story
#it came from and that story's position in data as story_id
#(chunks and labels are paired positionally, so this does not depend on the
#index labels of data and avoids a label lookup per chunk)
rows = [
    {'chunk_text': chunk, 'label': label, 'story_id': story_id}
    for story_id, (chunk_list, label) in enumerate(zip(data['chunks'], data['label']))
    for chunk in chunk_list
]
chunk_df = pd.DataFrame(rows)

In [None]:
#transform the chunks with the already-fitted TF-IDF vectorizer (no re-fit here)
x_chunks=tfidf.transform(chunk_df['chunk_text'])
#chunk-level target: the label stored on each chunk row
y_chunks=chunk_df['label']

In [None]:
#chunk level model
#NOTE(review): this fits the same model object again, now on all chunks
model.fit(x_chunks,y_chunks)

In [None]:
#predicting a label for every chunk
#NOTE(review): x_chunks is the same matrix the model was fitted on
chunk_df['chunk_pred']=model.predict(x_chunks)

In [None]:
#displaying chunk_df, which now includes the chunk_pred column
chunk_df

In [None]:
#checking chunk level performance
#NOTE(review): the predictions were made on the chunks the model was fitted
#on, so these numbers describe fit on the training chunks, not held-out data
print('Accuracy: ',accuracy_score(y_chunks,chunk_df['chunk_pred']))
print(classification_report(y_chunks,chunk_df['chunk_pred']))

In [None]:
#distinct values present in the chunk-level predictions
chunk_df['chunk_pred'].unique()

In [None]:
#numeric encoding of the two labels: consistent -> 1, contradict -> 0
label_map=dict(consistent=1,contradict=0)

In [None]:
#replace the predicted label strings with their numeric code from label_map
#NOTE: values that are not keys of label_map become NaN, so running this cell
#a second time on the already-mapped column would turn every entry into NaN
chunk_df['chunk_pred']=chunk_df['chunk_pred'].map(label_map)

In [None]:
#true label of each story: every chunk of a story carries the same label, so
#take the first one per story_id (grouping by the row index would instead
#return one entry per chunk)
true_labels = chunk_df.groupby('story_id')['label'].first()
chunk_df['chunk_pred'] = chunk_df['chunk_pred'].astype(int)
#majority vote per story: the mean of the 0/1 chunk predictions is the share
#of chunks predicted as 1; a share of at least one half gives 1, otherwise 0
story_share = chunk_df.groupby('story_id')['chunk_pred'].mean()
final_story_pred = (story_share >= 0.5).astype(int)

In [None]:
#checking my story level performance
#true label per story (the label on its first chunk), encoded with label_map
#so it is comparable with the 0/1 story-level predictions
story_true = chunk_df.groupby('story_id')['label'].first().map(label_map)
#put the predictions in the same story order as the true labels before scoring
story_pred = final_story_pred.loc[story_true.index]
#NOTE(review): these stories are the ones the chunk model was fitted on
print('Accuracy: ',accuracy_score(story_true,story_pred))
print(classification_report(story_true,story_pred))

In [None]:
#loading the test data
test_data=pd.read_csv('test.csv')
#preview only the first rows instead of rendering the whole frame
test_data.head()

In [None]:
#shape of the test data as (rows, columns)
test_data.shape

In [None]:
#column labels of the test data
test_data.columns

In [None]:
#dtype of each column in the test data
test_data.dtypes

In [None]:
#number of missing values in each test_data column (isna is the same check as isnull)
test_data.isna().sum()

In [None]:
#text normalization on feature test_data, applying the same steps as for the training text
#missing values become empty strings and everything is cast to str first,
#so the string operations below cannot fail on NaN or non-text entries
test_data['clean_content']=(
    test_data['content']
    .fillna('')
    .astype(str)
    #lowercase
    .str.lower()
    #remove every character that is not a letter, digit or whitespace
    #(applied to the lowercased text, not to the raw column again)
    .str.replace(r'[^a-zA-Z0-9\s]','',regex=True)
    #remove leading and trailing whitespace
    .str.strip()
)
#chunk test text
test_data['chunks'] = test_data['clean_content'].apply(chunk_text)

In [None]:
#flatten test chunks into rows: one row per chunk, tagged with the position
#of the story it came from
test_rows = [
    {'chunk_text': chunk, 'story_id': idx}
    for idx, chunk_list in enumerate(test_data['chunks'])
    for chunk in chunk_list
]
test_chunk_df = pd.DataFrame(test_rows)

In [None]:
#transform the test chunks with the already-fitted TF-IDF vectorizer (no re-fit)
x_test_chunks = tfidf.transform(test_chunk_df['chunk_text'])

In [None]:
#predicted label for every test chunk
test_chunk_df['chunk_pred']=model.predict(x_test_chunks)

In [None]:
#numeric encoding of the two labels: consistent -> 1, contradict -> 0
label_map=dict(consistent=1,contradict=0)

In [None]:
#replace the predicted label strings with their numeric code from label_map
#NOTE: values that are not keys of label_map become NaN, so running this cell
#a second time on the already-mapped column would turn every entry into NaN
test_chunk_df['chunk_pred']=test_chunk_df['chunk_pred'].map(label_map)

In [None]:
#aggregate to story level: average the 0/1 chunk predictions of each story and
#turn an average of at least 0.5 into 1, anything lower into 0
test_story_pred = (
    test_chunk_df
    .groupby('story_id')['chunk_pred']
    .mean()
    .map(lambda share: int(share >= 0.5))
    .reset_index()
)

In [None]:
#convert the 0/1 story-level predictions back into label text
label_map = {1: "consistent", 0: "contradict"}
#start from the aggregated per-story predictions so the output has exactly one
#row per story (the chunk-level frame has one row per chunk)
results_fixed = test_story_pred.copy()
results_fixed['prediction'] = results_fixed['chunk_pred'].map(label_map)
#story_id is the position of the story in test_data (assigned by enumerate)
results_fixed=results_fixed[['story_id','prediction']]

In [None]:
#write the results to results.csv without the index column
results_fixed.to_csv('results.csv',index=False)