In [2]:
import pandas as pd

# Load the provided CSV file to analyze its structure
file_path = '../resources/cleaned_file.csv'
data = pd.read_csv(file_path)

# info() prints its summary and returns None, so call it as a statement rather
# than packing it into a tuple with head(); a tuple is rendered as plain text
# with a stray None. The trailing bare head() becomes the cell's rich output.
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71157 entries, 0 to 71156
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        71157 non-null  object
 1   affiliation  71156 non-null  object
 2   year         71157 non-null  int64 
 3   abstract     71147 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.2+ MB


(                                               title  \
 0  The impact of the establishment of carbon emis...   
 1  Research on Agricultural Carbon Emissions and ...   
 2  Environmental Kuznets curve of carbon emission...   
 3  China's total carbon emissions and carbon peak...   
 4  Carbon Sinks and Carbon Emissions Discrepancie...   
 
                                          affiliation  year  \
 0                                   Jinan University  2022   
 1  Faculty of International Trade, Shanxi Univers...  2020   
 2  School of Civil Engineering and Transportation...  2024   
 3        Suzhou University of Science and Technology  2023   
 4                                Tsinghua University  2024   
 
                                             abstract  
 0  The China government focuses on changes in car...  
 1  Carbon emissions and strategies for reducing t...  
 2  Carbon emissions from China’s forest products ...  
 3  Background To cope with the problem of global ..

In [3]:
# from sklearn.feature_extraction.text import CountVectorizer
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import nltk
# import string

# # Download required NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')

# # Preprocessing function
# def preprocess_text(text):
#     stop_words = set(stopwords.words('english'))
#     # Lowercase text
#     text = text.lower()
#     # Remove punctuation
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     # Tokenize words
#     words = word_tokenize(text)
#     # Remove stop words
#     words = [word for word in words if word not in stop_words]
#     return ' '.join(words)

# # Apply preprocessing to the Abstracts column
# data['Processed_Abstracts'] = data['Abstracts'].apply(preprocess_text)

# # Display processed data
# data[['Abstracts', 'Processed_Abstracts']].head()


In [6]:
# Print the non-null count and dtype summary for the abstract column only.
data.loc[:, 'abstract'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 71157 entries, 0 to 71156
Series name: abstract
Non-Null Count  Dtype 
--------------  ----- 
71147 non-null  object
dtypes: object(1)
memory usage: 556.0+ KB


In [7]:
# Define a basic list of English stop words
import string


# Hand-maintained English stop words, grouped by initial letter. The last row
# holds the corpus-specific extras ("copyright" and the © sign).
basic_stopwords = {
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "can", "can't", "come", "could", "couldn't",
    "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during",
    "each", "few", "for", "from", "further",
    "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
    "let's", "me", "more", "most", "mustn't", "my", "myself",
    "no", "nor", "not",
    "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these",
    "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up", "very",
    "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's",
    "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
    "copyright", "©",
}

# Translation table that deletes every ASCII punctuation character. Built once
# here instead of on every call of the preprocessing function.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Updated preprocessing function
def preprocess_text_basic(text, stopwords=None):
    """Lowercase ``text``, drop stop words and strip ASCII punctuation.

    The stop-word check runs on each whitespace-separated token *before* its
    inner punctuation is removed (only leading/trailing punctuation is trimmed
    for the comparison). Stripping punctuation first would turn "don't" into
    "dont", which never matches the apostrophe entries of the stop-word list.
    A token is also dropped when its fully punctuation-stripped form is a stop
    word, so everything the earlier whole-text approach removed is still
    removed.

    Args:
        text: Text to clean. ``None`` and float NaN are treated as missing and
            yield an empty string; any other value is converted with ``str()``.
        stopwords: Optional container of lowercase stop words. Defaults to the
            notebook-level ``basic_stopwords`` set.

    Returns:
        The surviving tokens, punctuation-free and joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopwords is None:
        stopwords = basic_stopwords

    kept = []
    for token in str(text).lower().split():
        cleaned = token.translate(_PUNCTUATION_TABLE)
        if not cleaned:
            # The token consisted of punctuation only.
            continue
        if token.strip(string.punctuation) in stopwords or cleaned in stopwords:
            continue
        kept.append(cleaned)
    return ' '.join(kept)

# Apply preprocessing to the abstract column. Missing abstracts are replaced
# with an empty string first: str(NaN) would otherwise inject the literal token
# "nan" into the corpus for every row without an abstract.
data['Processed_Abstracts'] = (
    data['abstract']
    .fillna('')
    .astype(str)
    .map(preprocess_text_basic)
)

# Keep only the columns that are written to the intermediate CSV.
df = data[['affiliation', 'Processed_Abstracts']].copy()

# Save to CSV
output_path = '../resources/cleaned_processdata.csv'
if not df.empty:
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"Data saved to 'cleaned_processdata.csv'. Fetched {len(df)} results.")
else:
    # No scraping happens in this notebook; an empty frame means the input CSV
    # had no rows.
    print("No rows to save. Check the input CSV.")


Data saved to 'cleaned_processdata.csv'. Fetched 71157 results.


In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term count matrix from the processed abstracts. Terms are
# filtered with max_df=0.95 / min_df=2 and scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
dtm = vectorizer.fit_transform(data['Processed_Abstracts'])

# Fit the LDA model (5 topics assumed for simplicity, fixed seed). fit()
# returns the fitted estimator, so construction and fitting can be chained.
lda = LatentDirichletAllocation(random_state=42, n_components=5).fit(dtm)

# Extract topic words
def get_topics(model, feature_names, n_top_words=3):
    """Return the highest-weighted feature names of every topic in ``model``.

    Args:
        model: Fitted object exposing ``components_``; each row is read as the
            per-feature weight vector of one topic.
        feature_names: Sequence indexable by the positions in a
            ``components_`` row.
        n_top_words: How many names to return per topic (default 3). ``0``
            yields an empty list for each topic.

    Returns:
        A list with one entry per topic; each entry is a list of feature names
        ordered from highest to lowest weight.

    Raises:
        ValueError: If ``n_top_words`` is negative. A negative value would
            otherwise slice from the wrong end and silently return nearly the
            whole vocabulary.
    """
    if n_top_words < 0:
        raise ValueError(f"n_top_words must be non-negative, got {n_top_words}")

    topics = []
    for topic in model.components_:
        # argsort is ascending; reverse it and keep the leading n_top_words.
        ranked = topic.argsort()[::-1][:n_top_words]
        topics.append([feature_names[i] for i in ranked])
    return topics

# Look up the vocabulary learned by the vectorizer, then collect the top words
# of each fitted topic.
feature_names = vectorizer.get_feature_names_out()
topics = get_topics(model=lda, feature_names=feature_names)

# Bare last expression so the notebook renders the topic lists as cell output.
topics

# df = pd.DataFrame(topics)

# # Save to CSV
# if not df.empty:
#     df.to_csv('topics.csv', index=False, encoding='utf-8')
#     print(f"Data saved to 'topics.csv'. Fetched {len(df)} results.")
# else:
#     print("No data fetched. Check the scraping process.")



[['carbon', 'energy', 'co2'],
 ['energy', 'waste', 'thailand'],
 ['climate', 'change', 'study'],
 ['climate', 'change', 'study'],
 ['co2', 'oil', 'gas']]