In [7]:
import pandas

twitter_messages = pandas.read_csv("datasets/calentamientoClimatico.csv", delimiter=";")

twitter_messages.shape

(4225, 3)

In [8]:
# Creencia -> opinión expresada por el mensaje
# Confianza -> Porcentaje en que la opinión expresada corresponde a la creencia
twitter_messages.head()

Unnamed: 0,TWEET,CREENCIA,CONFIANZA
0,Global warming report urges governments to act...,Yes,1.0
1,Fighting poverty and global warming in Africa ...,Yes,1.0
2,Carbon offsets: How a Vatican forest failed to...,Yes,0.8786
3,Carbon offsets: How a Vatican forest failed to...,Yes,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,Yes,0.8087


In [9]:
# Pasar característica "Creencia" numérica
twitter_messages['CREENCIA'] = (twitter_messages['CREENCIA'] == 'Yes').astype(int)

twitter_messages.head(10)

Unnamed: 0,TWEET,CREENCIA,CONFIANZA
0,Global warming report urges governments to act...,1,1.0
1,Fighting poverty and global warming in Africa ...,1,1.0
2,Carbon offsets: How a Vatican forest failed to...,1,0.8786
3,Carbon offsets: How a Vatican forest failed to...,1,1.0
4,URUGUAY: Tools Needed for Those Most Vulnerabl...,1,0.8087
5,RT @sejorg: RT @JaymiHeimbuch: Ocean Saltiness...,1,1.0
6,Global warming evidence all around us|A messag...,1,1.0
7,Migratory Birds' New Climate Change Strategy: ...,1,1.0
8,Southern Africa: Competing for Limpopo Water: ...,1,1.0
9,"Global warming to impact wheat, rice productio...",1,1.0


In [10]:
import re

def normalize(message):
    message = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', message)
    message = re.sub('@[^\s]+', 'USER', message)
    message = message.lower().replace("ë", "e")
    message = re.sub('[^a-zA-Za-яA-Я1-9]', ' ', message)
    message = re.sub(' +', ' ', message)
    return message.strip()

In [11]:
# Aplicar función de normalización
twitter_messages["TWEET"] = twitter_messages["TWEET"].apply(normalize)

twitter_messages.head(10)

Unnamed: 0,TWEET,CREENCIA,CONFIANZA
0,global warming report urges governments to act...,1,1.0
1,fighting poverty and global warming in africa ...,1,1.0
2,carbon offsets how a vatican forest failed to ...,1,0.8786
3,carbon offsets how a vatican forest failed to ...,1,1.0
4,uruguay tools needed for those most vulnerable...,1,0.8087
5,rt user rt user ocean saltiness shows global w...,1,1.0
6,global warming evidence all around us|a messag...,1,1.0
7,migratory birds new climate change strategy st...,1,1.0
8,southern africa competing for limpopo water cl...,1,1.0
9,global warming to impact wheat rice production...,1,1.0


In [12]:
# Descargar y cargar stopwords
#import nltk
from nltk.corpus import stopwords

#nltk.download('stopwords')

stopwords_loaded = stopwords.words('english')

In [14]:
# Eliminación de las stop words
twitter_messages['TWEET'] = twitter_messages['TWEET'].apply(lambda message: ' '.join([word for word in message.split() if word not in (stopwords_loaded)]))

twitter_messages['TWEET']

0       global warming report urges governments act|br...
1           fighting poverty global warming africa [link]
2       carbon offsets vatican forest failed reduce gl...
3       carbon offsets vatican forest failed reduce gl...
4       uruguay tools needed vulnerable climate change...
                              ...                        
4220    83 _á climbing nyc august weather first day ma...
4221    user phrase global warming abandoned favor cli...
4222       global warming tube parody enjoy ipcc ocra url
4223    one eyed golfer dare tell global warming twent...
4224    man made global warming hair brained theory 4 ...
Name: TWEET, Length: 4225, dtype: object

In [16]:
# Stemming

from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

twitter_messages['TWEET'] = twitter_messages['TWEET'].apply(lambda message: ' '.join([stemmer.stem(word) for word in message.split(' ')]))

twitter_messages.head()

Unnamed: 0,TWEET,CREENCIA,CONFIANZA
0,global warm report urg govern act|brussel belg...,1,1.0
1,fight poverti global warm africa [link],1,1.0
2,carbon offset vatican forest fail reduc global...,1,0.8786
3,carbon offset vatican forest fail reduc global...,1,1.0
4,uruguay tool need vulner climat chang [link],1,0.8087


In [17]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jorge\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jorge\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
# Lematización

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

twitter_messages['TWEET'] = twitter_messages['TWEET'].apply(lambda message: ' '.join([lemmatizer.lemmatize(word) for word in message.split(' ')]))

twitter_messages

Unnamed: 0,TWEET,CREENCIA,CONFIANZA
0,global warm report urg govern act|brussel belg...,1,1.0000
1,fight poverti global warm africa [link],1,1.0000
2,carbon offset vatican forest fail reduc global...,1,0.8786
3,carbon offset vatican forest fail reduc global...,1,1.0000
4,uruguay tool need vulner climat chang [link],1,0.8087
...,...,...,...
4220,83 _á climb nyc august weather first day may u...,1,1.0000
4221,user phrase global warm abandon favor climat c...,1,1.0000
4222,global warm tube parodi enjoy ipcc ocra url,0,0.6411
4223,one eye golfer dare tell global warm twenti fi...,0,1.0000


In [20]:
# Distribución en conjunto de pruebas y aprendizaje

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(twitter_messages['TWEET'].values, twitter_messages['CREENCIA'].values, test_size=0.2)

In [22]:
# Crear canalización

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

learning_stages = Pipeline([('frecuence', CountVectorizer()), ('tfidf', TfidfTransformer()), ('algorithm', MultinomialNB())])

In [23]:
# Iniciar fase de aprendizaje

model = learning_stages.fit(x_train, y_train)

from sklearn.metrics import classification_report

report = classification_report(y_test, model.predict(x_test), digits=4)

print(report)

              precision    recall  f1-score   support

           0     0.8378    0.2756    0.4147       225
           1     0.7886    0.9806    0.8742       620

    accuracy                         0.7929       845
   macro avg     0.8132    0.6281    0.6445       845
weighted avg     0.8017    0.7929    0.7518       845



In [24]:
phrase = "Why should trust scientists with global warming if they didnt know Pluto wasnt a planet"

# Normalización
phrase = normalize(phrase)

# Eliminar stop words
phrase = ' '.join([word for word in phrase.split() if word not in (stopwords_loaded)])

# Aplicación de stemming
phrase = ' '.join([stemmer.stem(word) for word in phrase.split(' ')])

# Lematización
phrase = ' '.join([lemmatizer.lemmatize(word) for word in phrase.split(' ')])

prediction = model.predict([phrase])
print(prediction)
if (prediction[0] == 0):
    print(">> No cree en el calentamiento climático...")
else:
    print(">> Cree en el calentamiento climático...")


[1]
>> Cree en el calentamiento climático...
