In [4]:
import pandas as pd

# Load the provided CSV file to analyze its structure
file_path = 'resources/Final.csv'
data = pd.read_csv(file_path)

# Print the schema summary on its own line: DataFrame.info() writes to stdout
# and returns None, so bundling it into a tuple with head() only adds a stray
# None and forces the whole result into a plain-text tuple repr.
data.info()

# Keep head() as the sole final expression so the notebook renders it as a table
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4921 entries, 0 to 4920
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        4921 non-null   object 
 1   affiliation  4921 non-null   object 
 2   year         4921 non-null   int64  
 3   abstract     4919 non-null   object 
 4   latitude     4921 non-null   float64
 5   longitude    4921 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 230.8+ KB


(                                               title  \
 0  The impact of the establishment of carbon emis...   
 1  China's total carbon emissions and carbon peak...   
 2  Carbon Sinks and Carbon Emissions Discrepancie...   
 3  Enablers, Challenges, and Carbon Emissions of ...   
 4  The impact of environmental regulations on car...   
 
                                          affiliation  year  \
 0                                   Jinan University  2022   
 1        Suzhou University of Science and Technology  2023   
 2                                Tsinghua University  2024   
 3                               Chongqing University  2023   
 4  Kraków University of Economics: Uniwersytet Ek...  2024   
 
                                             abstract   latitude   longitude  
 0  The China government focuses on changes in car...  22.251825  113.529126  
 1  Background To cope with the problem of global ...  31.251764  120.572397  
 2  Assessing discrepancies between car

In [8]:
import string

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# Stop-word vocabulary used when filtering tokens. Entries are lowercase and
# grouped by initial letter; the last row holds extra noise tokens
# (copyright marks, symbols and stray single letters).
basic_stopwords = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can", "can't", "come", "could", "couldn't",
    "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
    "each", "few", "for", "from", "further",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
    "out", "over", "own",
    "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
    "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up", "very",
    "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't",
    "would", "wouldn't",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
    "copyright", "©", "±", "c", "n", "p",
}

# Function to detect language
def is_english(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Blank or whitespace-only input, and any text for which langdetect raises
    ``LangDetectException``, is reported as not English.
    """
    # Nothing to classify; skip the detector call entirely.
    if not text or not text.strip():
        return False
    # langdetect documents its algorithm as non-deterministic unless a seed is
    # set, so borderline abstracts could be kept on one run and dropped on the
    # next. Pinning the seed here (rather than once in another cell) keeps the
    # result independent of notebook execution order.
    DetectorFactory.seed = 0
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

# Updated preprocessing function
def preprocess_text_basic(text):
    """Lowercase ``text``, drop stop words and strip ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or an empty string when
        ``is_english(text)`` is False.
    """
    # Detect language first; non-English text is blanked out
    if not is_english(text):
        return ''

    strip_punctuation = str.maketrans('', '', string.punctuation)
    kept = []
    for token in text.lower().split():
        # Trim only the surrounding punctuation before the stop-word lookup.
        # Removing every punctuation character first would turn "aren't" into
        # "arent", which never matches the apostrophe entries in basic_stopwords.
        core = token.strip(string.punctuation)
        if not core or core in basic_stopwords:
            continue
        # Now remove punctuation inside the token (e.g. "low-carbon" -> "lowcarbon")
        cleaned = core.translate(strip_punctuation)
        # Second lookup keeps the previous behaviour for tokens that only become
        # a stop word once their inner punctuation is gone.
        if cleaned not in basic_stopwords:
            kept.append(cleaned)
    return ' '.join(kept)

# Apply preprocessing to the abstracts column.
# Missing abstracts are replaced with '' first: str(NaN) is the literal text
# "nan", which would otherwise be sent to language detection as if it were a
# real abstract. An empty string is rejected by is_english and maps to ''.
data['Processed_Abstracts'] = (
    data['abstract']
    .fillna('')
    .astype(str)
    .map(preprocess_text_basic)
)

# Display processed data
# data[['abstract', 'Processed_Abstracts']].head()


In [9]:
# Keep only the affiliation and the cleaned abstract for export
df = pd.DataFrame(data[['affiliation', 'Processed_Abstracts']])

# Write the CSV only when there is at least one row; otherwise report the problem
if df.empty:
    print("No data fetched. Check the scraping process.")
else:
    df.to_csv('resources/final_processdata.csv', index=False, encoding='utf-8')
    print(f"Data saved to 'final_processdata.csv'. Fetched {len(df)} results.")


Data saved to 'final_processdata.csv'. Fetched 4921 results.
