In [5]:
# STEP 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 2: Import necessary libraries
import json
import pandas as pd

# STEP 3: Load the JSON file
json_path = "/content/drive/MyDrive/reviews.json"  # adjust if needed
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# STEP 4: Flatten the nested structure into a DataFrame
flattened_reviews = []

for paper in data['paper']:
    paper_id = paper.get('id')
    preliminary_decision = paper.get('preliminary_decision')

    for review in paper['review']:
        flattened_review = {
            'paper_id': paper_id,
            'preliminary_decision': preliminary_decision,
            'review_id': review.get('id'),
            'confidence': review.get('confidence'),
            'evaluation': review.get('evaluation'),
            'orientation': review.get('orientation'),
            'language': review.get('lan'),
            'remarks': review.get('remarks'),
            'text': review.get('text'),
            'timespan': review.get('timespan')
        }
        flattened_reviews.append(flattened_review)

# STEP 5: Create the DataFrame
df = pd.DataFrame(flattened_reviews)
print("✅ Flattened DataFrame created. Shape:", df.shape)
print(df.head(2))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Flattened DataFrame created. Shape: (405, 10)
   paper_id preliminary_decision  review_id confidence evaluation orientation  \
0         1               accept          1          4          1           0   
1         1               accept          2          4          1           1   

  language remarks                                               text  \
0       es          - El artículo aborda un problema contingente y...   
1       es          El artículo presenta recomendaciones prácticas...   

     timespan  
0  2010-07-05  
1  2010-07-05  


STEP 2: TEXT CLEANING & PREPROCESSING

In [2]:
# STEP 6: Install and import NLP libraries
!pip install nltk spacy
!python -m spacy download es_core_news_sm
!python -m spacy download en_core_web_sm

import string
import re
import spacy
import nltk
from nltk.corpus import stopwords

# STEP 7: Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# STEP 8: Load language models and stopwords
nlp_es = spacy.load('es_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

spanish_stopwords = set(stopwords.words('spanish'))
english_stopwords = set(stopwords.words('english'))

# STEP 9: Clean & preprocess text
def preprocess_text(text, language='es'):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()

    if language == 'es':
        doc = nlp_es(text)
        stopwords_lang = spanish_stopwords
    else:
        doc = nlp_en(text)
        stopwords_lang = english_stopwords

    tokens = [token.lemma_ for token in doc if token.text not in stopwords_lang and not token.is_space]
    return ' '.join(tokens)


Collecting es-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencie

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


APPLY CLEANING + REMOVE EMPTY REVIEWS

In [4]:
# STEP 10: Split data by language
df_spanish = df[df['language'] == 'es'].copy()
df_english = df[df['language'] == 'en'].copy()

# STEP 11: Apply preprocessing
df_spanish['cleaned_text'] = df_spanish['text'].apply(preprocess_text, language='es')
df_english['cleaned_text'] = df_english['text'].apply(preprocess_text, language='en')

# STEP 12: Combine preprocessed data
df_cleaned = pd.concat([
    df_spanish[['text', 'cleaned_text', 'language', 'confidence', 'evaluation', 'orientation']],
    df_english[['text', 'cleaned_text', 'language', 'confidence', 'evaluation', 'orientation']]
], ignore_index=True)

# STEP 13: Create empty flag and drop truly empty reviews
df_cleaned['empty_cleaned'] = df_cleaned['cleaned_text'].apply(lambda x: x.strip() == "")
df_cleaned = df_cleaned[df_cleaned['empty_cleaned'] == False].reset_index(drop=True)

print("✅ Cleaned reviews shape:", df_cleaned.shape)
print(df_cleaned[['cleaned_text', 'language', 'orientation']].head())



#Length-Based Features

# Add word count and char count
df_cleaned['word_count'] = df_cleaned['cleaned_text'].apply(lambda x: len(str(x).split()))
df_cleaned['char_count'] = df_cleaned['text'].apply(lambda x: len(str(x)))

# Preview
print(df_cleaned[['cleaned_text', 'word_count', 'char_count']].head())

✅ Cleaned reviews shape: (399, 7)
                                        cleaned_text language orientation
0  artículo abordar problema contingente relevant...       es           0
1  artículo presentar recomendación práctico desa...       es           1
2  tema interesante poder ser mucho ayuda guía in...       es           1
3  explicar forma ordenado didáctico experiencia ...       es           1
4  autor describir metodología desarrollar forma ...       es           0
                                        cleaned_text  word_count  char_count
0  artículo abordar problema contingente relevant...          45         575
1  artículo presentar recomendación práctico desa...          50         618
2  tema interesante poder ser mucho ayuda guía in...         118        1259
3  explicar forma ordenado didáctico experiencia ...         119        1350
4  autor describir metodología desarrollar forma ...         162        1938
